def test_unknown_token(self):
    # pylint: disable=protected-access
    # We're putting this behavior in a test so that the behavior is documented.  There is
    # solver code that depends in a small way on how we treat the unknown token, so any
    # breaking change to this behavior should break a test, so you know you've done something
    # that needs more consideration.
    vocab = Vocabulary()
    oov_token = vocab._oov_token
    oov_index = vocab.get_token_index(oov_token)
    assert oov_index == 1
    assert vocab.get_token_index("unseen word") == oov_index
def test_set_from_file_reads_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR / 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('<S>\n')
        vocab_file.write('</S>\n')
        vocab_file.write('<UNK>\n')
        vocab_file.write('a\n')
        vocab_file.write('tricky\x0bchar\n')
        vocab_file.write('word\n')
        vocab_file.write('another\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>")

    assert vocab._oov_token == DEFAULT_OOV_TOKEN
    assert vocab.get_token_index("random string") == 3
    assert vocab.get_token_index("<S>") == 1
    assert vocab.get_token_index("</S>") == 2
    assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3
    assert vocab.get_token_index("a") == 4
    assert vocab.get_token_index("tricky\x0bchar") == 5
    assert vocab.get_token_index("word") == 6
    assert vocab.get_token_index("another") == 7
    assert vocab.get_token_from_index(0) == vocab._padding_token
    assert vocab.get_token_from_index(1) == "<S>"
    assert vocab.get_token_from_index(2) == "</S>"
    assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN
    assert vocab.get_token_from_index(4) == "a"
    assert vocab.get_token_from_index(5) == "tricky\x0bchar"
    assert vocab.get_token_from_index(6) == "word"
    assert vocab.get_token_from_index(7) == "another"
def test_add_word_to_index_gives_consistent_results(self):
    vocab = Vocabulary()
    initial_vocab_size = vocab.get_vocab_size()
    word_index = vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1

    # Now add it again, and make sure nothing changes.
    vocab.add_token_to_namespace("word")
    assert "word" in vocab.get_index_to_token_vocabulary().values()
    assert vocab.get_token_index("word") == word_index
    assert vocab.get_token_from_index(word_index) == "word"
    assert vocab.get_vocab_size() == initial_vocab_size + 1
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]
    return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    dep_labels = [token.dep_ or 'NONE' for token in tokens]
    return {index_name: [vocabulary.get_token_index(dep_label, self.namespace)
                         for dep_label in dep_labels]}
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    if self._coarse_tags:
        tag = token.pos_
    else:
        tag = token.tag_
    if tag is None:
        tag = 'NONE'
    return vocabulary.get_token_index(tag, self._namespace)
def test_namespaces(self): vocab = Vocabulary() initial_vocab_size = vocab.get_vocab_size() word_index = vocab.add_token_to_namespace("word", namespace='1') assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values() assert vocab.get_token_index("word", namespace='1') == word_index assert vocab.get_token_from_index(word_index, namespace='1') == "word" assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1 # Now add it again, in a different namespace and a different word, and make sure it's like # new. word2_index = vocab.add_token_to_namespace("word2", namespace='2') word_index = vocab.add_token_to_namespace("word", namespace='2') assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values() assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values() assert vocab.get_token_index("word", namespace='2') == word_index assert vocab.get_token_index("word2", namespace='2') == word2_index assert vocab.get_token_from_index(word_index, namespace='2') == "word" assert vocab.get_token_from_index(word2_index, namespace='2') == "word2" assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    if getattr(token, 'text_id', None) is not None:
        # `text_id` being set on the token means that we aren't using the vocab, we just use
        # this id instead.
        index = token.text_id
    else:
        text = token.text
        if self.lowercase_tokens:
            text = text.lower()
        index = vocabulary.get_token_index(text, self.namespace)
    return index
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
    indices = []
    if token.text is None:
        raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
    for character in self._character_tokenizer.tokenize(token.text):
        if getattr(character, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab, we just
            # use this id instead.
            index = character.text_id
        else:
            index = vocabulary.get_token_index(character.text, self._namespace)
        indices.append(index)
    return indices
def test_set_from_file_reads_non_padded_files(self):
    # pylint: disable=protected-access
    vocab_filename = self.TEST_DIR / 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('B-PERS\n')
        vocab_file.write('I-PERS\n')
        vocab_file.write('O\n')
        vocab_file.write('B-ORG\n')
        vocab_file.write('I-ORG\n')

    vocab = Vocabulary()
    vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')

    assert vocab.get_token_index("B-PERS", namespace='tags') == 0
    assert vocab.get_token_index("I-PERS", namespace='tags') == 1
    assert vocab.get_token_index("O", namespace='tags') == 2
    assert vocab.get_token_index("B-ORG", namespace='tags') == 3
    assert vocab.get_token_index("I-ORG", namespace='tags') == 4
    assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
    assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
    assert vocab.get_token_from_index(2, namespace='tags') == "O"
    assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
    assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
def test_get_embedding_layer_skips_inconsistent_lines(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 \n".encode('utf-8'))
    embedding_layer = get_pretrained_embedding_layer(embeddings_filename, vocab)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy()[:2], numpy.array([0.1, 0.4]))
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    if getattr(token, 'text_id', None) is not None:
        # `text_id` being set on the token means that we aren't using the vocab, we just use
        # this id instead.
        index = token.text_id
    else:
        text = token.text
        if self.lowercase_tokens:
            text = text.lower()
        index = vocabulary.get_token_index(text, self.namespace)
        if index == 1:
            self.oov_count += 1
    self.total_count += 1
    return index
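# Editorial sketch (assumes an AllenNLP-style Vocabulary; not part of the original indexer):
# the `index == 1` check above relies on the default OOV token sitting at index 1 in padded
# namespaces. A variant that avoids hard-coding that index could look it up instead:
#
#     oov_index = vocabulary.get_token_index(vocabulary._oov_token, self.namespace)
#     if index == oov_index:
#         self.oov_count += 1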
def __init__(self,
             vocab: Vocabulary,
             encoder: torch.nn.Module,
             decoder: torch.nn.Module,
             source_embedding: TokenEmbedder,
             target_embedding: TokenEmbedder,
             target_namespace: str = "target_tokens",
             start_symbol: str = '<GO>',
             eos_symbol: str = '<EOS>',
             max_decoding_step: int = 50,
             use_bleu: bool = True,
             label_smoothing: Optional[float] = None,
             ):
    super(ParallelSeq2Seq, self).__init__(vocab)
    self._encoder = encoder
    self._decoder = decoder
    self._src_embedding = source_embedding
    self._tgt_embedding = target_embedding
    self._start_id = vocab.get_token_index(start_symbol, target_namespace)
    self._eos_id = vocab.get_token_index(eos_symbol, target_namespace)
    self._max_decoding_step = max_decoding_step
    self._target_namespace = target_namespace
    self._label_smoothing = label_smoothing
    self._output_projection_layer = torch.nn.Linear(decoder.hidden_dim,
                                                    vocab.get_vocab_size(target_namespace))

    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                               self._target_namespace)
        self._bleu = BLEU(exclude_indices={pad_index, self._eos_id, self._start_id})
    else:
        self._bleu = None
def evaluate_embeddings(embedding, vocab: Vocabulary):
    cosine = CosineSimilarity(dim=0)
    simlex999 = read_simlex999()
    sims_pred = []
    oov_count = 0
    for word1, word2, sim in simlex999:
        word1_id = vocab.get_token_index(word1, 'token_in')  # ID of word1.
        if word1_id == 1:
            # An ID of 1 means the word is out of vocabulary (OOV).
            sims_pred.append(0.)
            oov_count += 1
            continue

        word2_id = vocab.get_token_index(word2, 'token_in')  # ID of word2.
        if word2_id == 1:
            sims_pred.append(0.)
            oov_count += 1
            continue

        # Cosine similarity between the vectors of word1 and word2.
        sim_pred = cosine(embedding.weight[word1_id], embedding.weight[word2_id]).item()
        sims_pred.append(sim_pred)

    assert len(sims_pred) == len(simlex999)  # sims_pred and simlex999 must have the same length.
    print('# of OOV words: {} / {}'.format(oov_count, len(simlex999)))
    print(pearsonr(sims_pred, [sim for _, _, sim in simlex999]))

    # Compare the two sets of similarities: scipy.stats.spearmanr returns Spearman's rank-order
    # correlation coefficient and the p-value for a test of non-correlation.
    return spearmanr(sims_pred, [sim for _, _, sim in simlex999])
def test_namespaces(self): vocab = Vocabulary() initial_vocab_size = vocab.get_vocab_size() word_index = vocab.add_token_to_namespace("word", namespace="1") assert "word" in vocab.get_index_to_token_vocabulary( namespace="1").values() assert vocab.get_token_index("word", namespace="1") == word_index assert vocab.get_token_from_index(word_index, namespace="1") == "word" assert vocab.get_vocab_size(namespace="1") == initial_vocab_size + 1 # Now add it again, in a different namespace and a different word, and make sure it's like # new. word2_index = vocab.add_token_to_namespace("word2", namespace="2") word_index = vocab.add_token_to_namespace("word", namespace="2") assert "word" in vocab.get_index_to_token_vocabulary( namespace="2").values() assert "word2" in vocab.get_index_to_token_vocabulary( namespace="2").values() assert vocab.get_token_index("word", namespace="2") == word_index assert vocab.get_token_index("word2", namespace="2") == word2_index assert vocab.get_token_from_index(word_index, namespace="2") == "word" assert vocab.get_token_from_index(word2_index, namespace="2") == "word2" assert vocab.get_vocab_size(namespace="2") == initial_vocab_size + 2
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]: vocab_index_mapping: List[Tuple[int, int]] = [] for index in range(self.vocab.get_vocab_size(namespace="tokens")): token = self.vocab.get_token_from_index(index=index, namespace="tokens") archived_token_index = archived_vocab.get_token_index(token, namespace="tokens") # Checking if we got the UNK token index, because we don't want all new token # representations initialized to UNK token's representation. We do that by checking if # the two tokens are the same. They will not be if the token at the archived index is # UNK. if ( archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token ): vocab_index_mapping.append((index, archived_token_index)) return vocab_index_mapping
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
    indices = []
    if token.text is None:
        raise ConfigurationError(
            'TokenCharactersIndexer needs a tokenizer that retains text')
    for character in self._character_tokenizer.tokenize(token.text):
        if getattr(character, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab, we just
            # use this id instead.
            index = character.text_id
        else:
            index = vocabulary.get_token_index(character.text, self._namespace)
        indices.append(index)
    return indices
def get_synonyms(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
    """Given a token, return a list of top N most similar words to the token"""
    token_id = vocab.get_token_index(token, 'tags_in')
    token_vec = embedding.weight[token_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('tags_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim

    return sims.most_common(num_synonyms)
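# Hypothetical usage of get_synonyms (editorial example; the names `tag_embedding` and `vocab`
# are placeholders, not from the original code): `tag_embedding` is a trained embedding layer
# over the 'tags_in' namespace and `vocab` is the fitted Vocabulary. This prints the ten
# entries closest to 'B-PER' by cosine similarity.
#
#     for tag, similarity in get_synonyms('B-PER', tag_embedding, vocab):
#         print(tag, round(similarity, 3))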
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[int]]:
    tags: List[str] = []

    for token in tokens:
        if self._coarse_tags:
            tag = token.pos_
        else:
            tag = token.tag_
        if not tag:
            tag = "NONE"

        tags.append(tag)

    return {"tokens": [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
def __init__(self,
             embedder: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             vocab: Vocabulary,
             positive_label: str = '4') -> None:
    super().__init__(vocab)
    self.embedder = embedder
    self.encoder = encoder
    self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                  out_features=vocab.get_vocab_size('labels'))
    positive_index = vocab.get_token_index(positive_label, namespace='labels')
    self.accuracy = CategoricalAccuracy()
    self.f1_measure = F1Measure(positive_index)
    self.loss_function = torch.nn.CrossEntropyLoss()
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    tags: List[str] = []
    for token in tokens:
        if self._coarse_tags:
            tag = token.pos_
        else:
            tag = token.tag_
        if tag is None:
            tag = 'NONE'
        tags.append(tag)
    return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary) -> Dict[str, List[int]]:
    indices: List[int] = []

    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        text = self._get_feature_value(token)
        if self.namespace is None:
            # We could have a check here that `text` is an int; not sure it's worth it.
            indices.append(text)  # type: ignore
        else:
            if self.lowercase_tokens:
                text = text.lower()
            indices.append(vocabulary.get_token_index(text, self.namespace))

    return {"tokens": indices}
def _doc_bioul_to_spans(doc: List[str], vocab: Vocabulary) -> List[Tuple[int, int, int]]:
    '''Given bioul predictions of one document, return entities in the span format'''
    spans = []
    for i, l in enumerate(doc):
        if l != 'O':
            span_label = l[2:]
            span_label_index = vocab.get_token_index(
                span_label, namespace='span_labels')  # TODO: is this the right namespace?
            if l.startswith('U'):
                spans.append((i, i + 1, span_label_index))
            elif l.startswith('B'):
                start_index = i
            elif l.startswith('L'):
                spans.append((start_index, i + 1, span_label_index))
    return spans
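# Worked example (editorial, indices depend on whatever the 'span_labels' namespace assigns):
# for doc = ['O', 'B-PER', 'L-PER', 'U-LOC'] the function returns
#
#     [(1, 3, vocab.get_token_index('PER', namespace='span_labels')),
#      (3, 4, vocab.get_token_index('LOC', namespace='span_labels'))]
#
# i.e. half-open [start, end) token spans paired with the label's vocabulary index. An 'I-*'
# tag inside a B...L span triggers a label lookup but adds no span of its own.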
def tokens_to_indices( self, tokens: List[Token], vocabulary: Vocabulary ) -> Dict[str, List[int]]: indices: List[int] = [] for token in itertools.chain(self._start_tokens, tokens, self._end_tokens): if getattr(token, "text_id", None) is not None: # `text_id` being set on the token means that we aren't using the vocab, we just use # this id instead. indices.append(token.text_id) else: text = token.text if self.lowercase_tokens: text = text.lower() indices.append(vocabulary.get_token_index(text, self.namespace)) return {"tokens": indices}
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    indices: List[int] = []
    for token in tokens:
        if getattr(token, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab,
            # we just use this id instead.
            indices.append(token.text_id)
        else:
            text = token.text
            if self.lowercase_tokens:
                text = text.lower()
            indices.append(vocabulary.get_token_index(text, self.namespace))
    return {index_name: indices}
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
    indices = []
    if token.text is None:
        raise ConfigurationError(
            'TokenBPEIndexer needs a tokenizer that retains text')
    for piece in self._bpe_tokenizer.tokenize(token.text):
        if getattr(piece, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab, we just
            # use this id instead.
            index = piece.text_id
        else:
            index = vocabulary.get_token_index(piece.text, self._namespace)
        indices.append(index)
        if index == 1:
            self.oov_count += 1
        self.total_count += 1
    return indices
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    indices: List[int] = []

    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        if getattr(token, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab, we just use
            # this id instead.
            indices.append(token.text_id)
        else:
            text = token.text
            if self.lowercase_tokens:
                text = text.lower()
            indices.append(vocabulary.get_token_index(text, self.namespace))

    return {index_name: indices}
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[List[int]]]:
    indices: List[List[int]] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        token_indices: List[int] = []
        if token.text is None:
            raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just
                # use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text, self._namespace)
            token_indices.append(index)
        indices.append(token_indices)
    return {index_name: indices}
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[List[int]]]:
    indices: List[List[int]] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        token_indices: List[int] = []
        if token.text is None:
            raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just
                # use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text, self._namespace)
            token_indices.append(index)
        indices.append(token_indices)
    return {index_name: indices}
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder) -> None:
    super().__init__(vocab)
    self._embedder = embedder
    self._encoder = encoder
    self._classifier = torch.nn.Linear(in_features=2 * encoder.get_output_dim(),
                                       out_features=vocab.get_vocab_size('labels'))
    # self._ffnn = torch.nn.Sequential(
    #     # linear
    #     # dropout
    #     # tanh
    #     # linear
    # )

    # define f1 here, use as plain F1 measure not spanBased
    self._metric = F1Measure(positive_label=vocab.get_token_index(token='positive',
                                                                  namespace='labels'))
def get_related(token: str, embedding: Model, vocab: Vocabulary, num_related: int = 20):
    """Given a token, return a list of top 20 most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    # The embedding weight matrix (possibly initialized from pretrained vectors) serves as the
    # lookup table for word vectors.
    token_vec = embedding.weight[token_id]
    # CosineSimilarity computes a simple cosine similarity between two vectors.
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
        # Cosine similarity of our token vector with every other word vector in the vocabulary.
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim  # save the cosine similarity value

    return sims.most_common(num_related)
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    tags: List[str] = []
    for token in tokens:
        if self._coarse_tags:
            tag = token.pos_
        else:
            tag = token.tag_
        if not tag:
            tag = 'NONE'
        tags.append(tag)
    temp = {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
    # temp2 = [vocabulary.get_token_index(tag, self._namespace) for tag in tags]
    # import pdb
    # pdb.set_trace()
    return temp
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[List[int]]]:
    vocab_size = vocabulary.get_vocab_size(self._namespace)

    # Initial steps are exactly the same as super().tokens_to_indices()
    indices: List[List[int]] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        token_indices: List[int] = []
        if token.text is None:
            raise ConfigurationError(
                'BMETokenIndexer needs a tokenizer that retains text')
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just
                # use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text, self._namespace)
            token_indices.append(index)

        # Generating BME (steps that are different from super().tokens_to_indices())
        B = F.one_hot(torch.tensor(self._pad(token_indices[:self._begin_size],
                                             self._begin_size, True)),
                      num_classes=vocab_size).reshape(-1)
        M = F.one_hot(torch.tensor([0] if len(token_indices) == 0 else token_indices),
                      num_classes=vocab_size).sum(0)
        E = F.one_hot(torch.tensor(self._pad(token_indices[-self._end_size:],
                                             self._end_size)),
                      num_classes=vocab_size).reshape(-1)
        indices.append(torch.cat((B, M, E)).tolist())

    return {index_name: indices}
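# Editorial illustration (toy numbers, not part of the indexer above): the "M" component is a
# bag-of-characters count vector over the character vocabulary, so with vocab_size = 4 and
# character indices [2, 3, 1] for one token it comes out as [0, 1, 1, 1].
import torch
import torch.nn.functional as F

toy_vocab_size = 4
toy_token_indices = torch.tensor([2, 3, 1])  # character ids for one token
middle = F.one_hot(toy_token_indices, num_classes=toy_vocab_size).sum(0)
assert middle.tolist() == [0, 1, 1, 1]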
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests for Valid Vocab Extension thoroughly: Vocab extension is valid
    when overlapping namespaces have same padding behaviour (padded/non-padded)
    Summary of namespace paddings in this test:
    original_vocab namespaces
        tokens0     padded
        tokens1     non-padded
        tokens2     padded
        tokens3     non-padded
    instances namespaces
        tokens0     padded
        tokens1     non-padded
        tokens4     padded
        tokens5     non-padded
    TypicalExtention example: (of tokens1 namespace)
    -> original_vocab index2token
       apple   #0->apple
       bat     #1->bat
       cat     #2->cat
    -> Token to be extended with: cat, an, apple, banana, atom, bat
    -> extended_vocab: index2token
       apple   #0->apple
       bat     #1->bat
       cat     #2->cat
       an      #3->an
       atom    #4->atom
       banana  #5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'

    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4

    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2

    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:0
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:1
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:2

    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1

    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # namespaces: tokens0, tokens1 are common.
    # tokens2, tokens3 only the vocab has; tokens4, tokens5 only the instances have.
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that _non_padded_namespaces list is consistent after extension
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens, instances of "tokens1" ns has 5 tokens, 2 overlapping
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # 2 extra overlapping because padded

    # namespaces tokens2 and tokens3 were only in original_vocab,
    # and their token counts should be the same in extended_vocab
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # namespaces tokens4 and tokens5 were only in instances
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l,m,n,o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

    # Word2index mapping of all words in all namespaces of original_vocab
    # should be maintained in extended_vocab
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And same for Index2Word mapping
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
# coding=utf-8
# @Author: 莫冉
# @Date: 2020-08-06
from allennlp.data.vocabulary import Vocabulary

vocab_file = "../data/base_bert/vocab.txt"
save_path = "../../../vocab_path"

vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")
vocab.set_from_file(vocab_file, is_padded=True, oov_token="[UNK]")
vocab.save_to_files(save_path)

print(vocab.get_token_index(vocab._oov_token))
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2VecEncoder,
    kg_encoder: Seq2VecEncoder,
    max_decoding_steps: int = 64,
    attention: Attention = None,
    target_namespace: str = "tokens",
    scheduled_sampling_ratio: float = 0.4,
) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio  # Maybe we can try

    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self.pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                self._target_namespace)
    self.hidden_dim = 300
    self._max_decoding_steps = max_decoding_steps

    self.kd_metric = KD_Metric()
    self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
    self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
    self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
    self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
    self.topic_acc = Average()
    self.distinct1 = Distinct1()
    self.distinct2 = Distinct2()

    # anything about module
    self._source_embedder = source_embedder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    target_embedding_dim = source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    self._encoder = encoder
    self._kg_encoder = kg_encoder
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim
    # self.select_entity_num = 3
    self._decoder_input_dim = self.hidden_dim * 2 + total_entiy  # self.select_entity_num
    self._attention = None
    if attention:
        self._attention = attention
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

    self._decoder_cell = LSTMCell(self.hidden_dim * 2, self._decoder_output_dim)
    self._output_projection_layer = Linear(self.hidden_dim, num_classes)

    # with open('cy/comp_topic2num.pk', 'rb') as f:
    with open('fd/word2idx.pk', 'rb') as f:
        self.word_idx = pickle.load(f)
    self.vocab_to_idx = {}
    self.idx_to_vocab_list = []
    for word, k in self.word_idx.items():
        self.vocab_to_idx[vocab.get_token_index(word.strip())] = k
        self.idx_to_vocab_list.append(vocab.get_token_index(word.strip()))
    self.entity_size = total_entiy
    self.entity_embedding = torch.nn.Parameter(
        torch.Tensor(self.entity_size, self.hidden_dim))
    torch.nn.init.xavier_uniform_(self.entity_embedding, gain=1.414)
    self.entity_linear = Linear(self.hidden_dim * 2, self.entity_size)
    self.gen_linear = Linear(self.hidden_dim, 1)
    self.clac_num = 0
def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
    dep_label = token.dep_ or 'NONE'
    return vocabulary.get_token_index(dep_label, self.namespace)
def index(self, vocab: Vocabulary):
    self._mapping_array = [vocab.get_token_index(x.text, self._target_namespace)
                           for x in self._source_tokens]
def index(self, vocab: Vocabulary):
    if self._indexed_labels is None:
        self._indexed_labels = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                                for label in self.labels]
def __init__(self,
             vocab: Vocabulary,
             token_embedder: TextFieldEmbedder,
             entity_embedder: TextFieldEmbedder,
             relation_embedder: TextFieldEmbedder,
             knowledge_graph_path: str,
             use_shortlist: bool,
             hidden_size: int,
             num_layers: int,
             cutoff: int = 30,
             tie_weights: bool = False,
             dropout: float = 0.4,
             dropouth: float = 0.3,
             dropouti: float = 0.65,
             dropoute: float = 0.1,
             wdrop: float = 0.5,
             alpha: float = 2.0,
             beta: float = 1.0,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    super(KglmDisc, self).__init__(vocab)

    # We extract the `Embedding` layers from the `TokenEmbedders` to apply dropout later on.
    # pylint: disable=protected-access
    self._token_embedder = token_embedder._token_embedders['tokens']
    self._entity_embedder = entity_embedder._token_embedders['entity_ids']
    self._relation_embedder = relation_embedder._token_embedders['relations']
    self._recent_entities = RecentEntities(cutoff=cutoff)
    self._knowledge_graph_lookup = KnowledgeGraphLookup(knowledge_graph_path, vocab=vocab)
    self._use_shortlist = use_shortlist
    self._hidden_size = hidden_size
    self._num_layers = num_layers
    self._cutoff = cutoff
    self._tie_weights = tie_weights

    # Dropout
    self._locked_dropout = LockedDropout()
    self._dropout = dropout
    self._dropouth = dropouth
    self._dropouti = dropouti
    self._dropoute = dropoute
    self._wdrop = wdrop

    # Regularization strength
    self._alpha = alpha
    self._beta = beta

    # RNN Encoders.
    entity_embedding_dim = entity_embedder.get_output_dim()
    token_embedding_dim = token_embedder.get_output_dim()
    self.entity_embedding_dim = entity_embedding_dim
    self.token_embedding_dim = token_embedding_dim

    rnns: List[torch.nn.Module] = []
    for i in range(num_layers):
        if i == 0:
            input_size = token_embedding_dim
        else:
            input_size = hidden_size
        if i == num_layers - 1:
            output_size = token_embedding_dim + 2 * entity_embedding_dim
        else:
            output_size = hidden_size
        rnns.append(torch.nn.LSTM(input_size, output_size, batch_first=True))
    rnns = [WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop) for rnn in rnns]
    self.rnns = torch.nn.ModuleList(rnns)

    # Various linear transformations.
    self._fc_mention_type = torch.nn.Linear(
        in_features=token_embedding_dim,
        out_features=4)

    if not use_shortlist:
        self._fc_new_entity = torch.nn.Linear(
            in_features=entity_embedding_dim,
            out_features=vocab.get_vocab_size('entity_ids'))

        if tie_weights:
            self._fc_new_entity.weight = self._entity_embedder.weight

    self._state: Optional[Dict[str, Any]] = None

    # Metrics
    self._unk_index = vocab.get_token_index(DEFAULT_OOV_TOKEN)
    self._unk_penalty = math.log(vocab.get_vocab_size('tokens_unk'))
    self._avg_mention_type_loss = Average()
    self._avg_new_entity_loss = Average()
    self._avg_knowledge_graph_entity_loss = Average()
    self._new_mention_f1 = F1Measure(positive_label=1)
    self._kg_mention_f1 = F1Measure(positive_label=2)
    self._new_entity_accuracy = CategoricalAccuracy()
    self._new_entity_accuracy20 = CategoricalAccuracy(top_k=20)
    self._parent_ppl = Ppl()
    self._relation_ppl = Ppl()

    initializer(self)
def index(self, vocab: Vocabulary):
    if self._label_id is None:
        self._label_id = vocab.get_token_index(self.label, self._label_namespace)  # type: ignore
def index(self, vocab: Vocabulary):
    if self.is_global_rule and self._rule_id is None:
        self._rule_id = vocab.get_token_index(self.rule, self._vocab_namespace)
def index(self, vocab: Vocabulary):
    if self.labels is not None:
        self._indexed_labels = [
            vocab.get_token_index(label, self._label_namespace) for label in self.labels
        ]
def index(self, vocab: Vocabulary):
    if self._label_id is None:
        self._label_id = vocab.get_token_index(self.label, self._label_namespace)  # type: ignore
def test_from_params_valid_vocab_extension_thoroughly(self): """ Tests for Valid Vocab Extension thoroughly: Vocab extension is valid when overlapping namespaces have same padding behaviour (padded/non-padded) Summary of namespace paddings in this test: original_vocab namespaces tokens0 padded tokens1 non-padded tokens2 padded tokens3 non-padded instances namespaces tokens0 padded tokens1 non-padded tokens4 padded tokens5 non-padded TypicalExtention example: (of tokens1 namespace) -> original_vocab index2token apple #0->apple bat #1->bat cat #2->cat -> Token to be extended with: cat, an, apple, banana, atom, bat -> extended_vocab: index2token apple #0->apple bat #1->bat cat #2->cat an #3->an atom #4->atom banana #5->banana """ vocab_dir = self.TEST_DIR / "vocab_save" original_vocab = Vocabulary( non_padded_namespaces=["tokens1", "tokens3"]) original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2 original_vocab.add_token_to_namespace("bat", namespace="tokens0") # index:3 original_vocab.add_token_to_namespace("cat", namespace="tokens0") # index:4 original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0 original_vocab.add_token_to_namespace("bat", namespace="tokens1") # index:1 original_vocab.add_token_to_namespace("cat", namespace="tokens1") # index:2 original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0 original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1 original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2 original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0 original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1 original_vocab.save_to_files(vocab_dir) text_field0 = TextField( [ Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"] ], {"tokens0": SingleIdTokenIndexer("tokens0")}, ) text_field1 = TextField( [ Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"] ], {"tokens1": SingleIdTokenIndexer("tokens1")}, ) text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]], {"tokens4": SingleIdTokenIndexer("tokens4")}) text_field5 = TextField([Token(t) for t in ["x", "y", "z"]], {"tokens5": SingleIdTokenIndexer("tokens5")}) instances = Batch([ Instance({ "text0": text_field0, "text1": text_field1, "text4": text_field4, "text5": text_field5, }) ]) params = Params({ "type": "extend", "directory": vocab_dir, "non_padded_namespaces": ["tokens1", "tokens5"], }) extended_vocab = Vocabulary.from_params(params, instances=instances) # namespaces: tokens0, tokens1 is common. # tokens2, tokens3 only vocab has. tokens4, tokens5 only instances extended_namespaces = {*extended_vocab._token_to_index} assert extended_namespaces == {"tokens{}".format(i) for i in range(6)} # # Check that _non_padded_namespaces list is consistent after extension assert extended_vocab._non_padded_namespaces == { "tokens1", "tokens3", "tokens5" } # # original_vocab["tokens1"] has 3 tokens, instances of "tokens1" ns has 5 tokens. 
2 overlapping assert extended_vocab.get_vocab_size("tokens1") == 6 assert extended_vocab.get_vocab_size( "tokens0") == 8 # 2 extra overlapping because padded # namespace tokens3, tokens4 was only in original_vocab, # and its token count should be same in extended_vocab assert extended_vocab.get_vocab_size( "tokens2") == original_vocab.get_vocab_size("tokens2") assert extended_vocab.get_vocab_size( "tokens3") == original_vocab.get_vocab_size("tokens3") # namespace tokens2 was only in instances, # and its token count should be same in extended_vocab assert extended_vocab.get_vocab_size( "tokens4") == 6 # l,m,n,o + oov + padding assert extended_vocab.get_vocab_size("tokens5") == 3 # x,y,z # Word2index mapping of all words in all namespaces of original_vocab # should be maintained in extended_vocab for namespace, token2index in original_vocab._token_to_index.items(): for token, _ in token2index.items(): vocab_index = original_vocab.get_token_index(token, namespace) extended_vocab_index = extended_vocab.get_token_index( token, namespace) assert vocab_index == extended_vocab_index # And same for Index2Word mapping for namespace, index2token in original_vocab._index_to_token.items(): for index, _ in index2token.items(): vocab_token = original_vocab.get_token_from_index( index, namespace) extended_vocab_token = extended_vocab.get_token_from_index( index, namespace) assert vocab_token == extended_vocab_token
def index(self, vocab: Vocabulary):
    if self._label_ids is None:
        self._label_ids = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                           for label in self.labels]
    if not self._num_labels:
        self._num_labels = vocab.get_vocab_size(self._label_namespace)
def index(self, vocab: Vocabulary):
    if self.is_global_rule and self._rule_id is None:
        self._rule_id = vocab.get_token_index(self.rule, self._vocab_namespace)