Example No. 1
 def test_unknown_token(self):
     # pylint: disable=protected-access
     # We're putting this behavior in a test so that the behavior is documented.  There is
     # solver code that depends in a small way on how we treat the unknown token, so any
     # breaking change to this behavior should break a test, so you know you've done something
     # that needs more consideration.
     vocab = Vocabulary()
     oov_token = vocab._oov_token
     oov_index = vocab.get_token_index(oov_token)
     assert oov_index == 1
     assert vocab.get_token_index("unseen word") == oov_index
Example No. 2
    def test_set_from_file_reads_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR / 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('<S>\n')
            vocab_file.write('</S>\n')
            vocab_file.write('<UNK>\n')
            vocab_file.write('a\n')
            vocab_file.write('tricky\x0bchar\n')
            vocab_file.write('word\n')
            vocab_file.write('another\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>")

        assert vocab._oov_token == DEFAULT_OOV_TOKEN
        assert vocab.get_token_index("random string") == 3
        assert vocab.get_token_index("<S>") == 1
        assert vocab.get_token_index("</S>") == 2
        assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3
        assert vocab.get_token_index("a") == 4
        assert vocab.get_token_index("tricky\x0bchar") == 5
        assert vocab.get_token_index("word") == 6
        assert vocab.get_token_index("another") == 7
        assert vocab.get_token_from_index(0) == vocab._padding_token
        assert vocab.get_token_from_index(1) == "<S>"
        assert vocab.get_token_from_index(2) == "</S>"
        assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN
        assert vocab.get_token_from_index(4) == "a"
        assert vocab.get_token_from_index(5) == "tricky\x0bchar"
        assert vocab.get_token_from_index(6) == "word"
        assert vocab.get_token_from_index(7) == "another"
Example No. 3
    def test_add_word_to_index_gives_consistent_results(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1
Example No. 4
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        tags = ['NONE' if not token.ent_type_ else token.ent_type_ for token in tokens]

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
Example No. 5
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        dep_labels = [token.dep_ or 'NONE' for token in tokens]

        return {index_name: [vocabulary.get_token_index(dep_label, self.namespace) for dep_label in dep_labels]}
Example No. 6
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
     if self._coarse_tags:
         tag = token.pos_
     else:
         tag = token.tag_
     if tag is None:
         tag = 'NONE'
     return vocabulary.get_token_index(tag, self._namespace)
Example No. 7
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
Example No. 8
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
     if getattr(token, 'text_id', None) is not None:
         # `text_id` being set on the token means that we aren't using the vocab, we just use
         # this id instead.
         index = token.text_id
     else:
         text = token.text
         if self.lowercase_tokens:
             text = text.lower()
         index = vocabulary.get_token_index(text, self.namespace)
     return index
Example No. 9
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace='tokens')):
         token = self.vocab.get_token_from_index(index=index, namespace='tokens')
         archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
Example No. 10
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
     indices = []
     if token.text is None:
         raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
     for character in self._character_tokenizer.tokenize(token.text):
         if getattr(character, 'text_id', None) is not None:
             # `text_id` being set on the token means that we aren't using the vocab, we just
             # use this id instead.
             index = character.text_id
         else:
             index = vocabulary.get_token_index(character.text, self._namespace)
         indices.append(index)
     return indices
Example No. 11
    def test_set_from_file_reads_non_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR / 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('B-PERS\n')
            vocab_file.write('I-PERS\n')
            vocab_file.write('O\n')
            vocab_file.write('B-ORG\n')
            vocab_file.write('I-ORG\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')
        assert vocab.get_token_index("B-PERS", namespace='tags') == 0
        assert vocab.get_token_index("I-PERS", namespace='tags') == 1
        assert vocab.get_token_index("O", namespace='tags') == 2
        assert vocab.get_token_index("B-ORG", namespace='tags') == 3
        assert vocab.get_token_index("I-ORG", namespace='tags') == 4
        assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
        assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
        assert vocab.get_token_from_index(2, namespace='tags') == "O"
        assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
        assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
Example No. 12
 def test_get_embedding_layer_skips_inconsistent_lines(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word1")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write("word2 0.1 0.4 \n".encode('utf-8'))
     embedding_layer = get_pretrained_embedding_layer(
         embeddings_filename, vocab)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy()[:2],
                               numpy.array([0.1, 0.4]))
Example No. 13
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
     if getattr(token, 'text_id', None) is not None:
         # `text_id` being set on the token means that we aren't using the vocab, we just use
         # this id instead.
         index = token.text_id
     else:
         text = token.text
         if self.lowercase_tokens:
             text = text.lower()
         index = vocabulary.get_token_index(text, self.namespace)
     if index == 1:  # index 1 is the default OOV token index
         self.oov_count += 1
     self.total_count += 1
     return index
Example No. 14
    def __init__(
        self,
        vocab: Vocabulary,
        encoder: torch.nn.Module,
        decoder: torch.nn.Module,
        source_embedding: TokenEmbedder,
        target_embedding: TokenEmbedder,
        target_namespace: str = "target_tokens",
        start_symbol: str = '<GO>',
        eos_symbol: str = '<EOS>',
        max_decoding_step: int = 50,
        use_bleu: bool = True,
        label_smoothing: Optional[float] = None,
    ):
        super(ParallelSeq2Seq, self).__init__(vocab)
        self._encoder = encoder
        self._decoder = decoder
        self._src_embedding = source_embedding
        self._tgt_embedding = target_embedding

        self._start_id = vocab.get_token_index(start_symbol, target_namespace)
        self._eos_id = vocab.get_token_index(eos_symbol, target_namespace)
        self._max_decoding_step = max_decoding_step

        self._target_namespace = target_namespace
        self._label_smoothing = label_smoothing

        self._output_projection_layer = torch.nn.Linear(
            decoder.hidden_dim, vocab.get_vocab_size(target_namespace))

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)
            self._bleu = BLEU(
                exclude_indices={pad_index, self._eos_id, self._start_id})
        else:
            self._bleu = None
Example No. 15
def evaluate_embeddings(embedding, vocab: Vocabulary):
    cosine = CosineSimilarity(dim=0)
    simlex999 = read_simlex999()
    sims_pred = []
    oov_count = 0
    for word1, word2, sim in simlex999:
        word1_id = vocab.get_token_index(word1, 'token_in')  # vocabulary index of word1
        if word1_id == 1:  # index 1 is the OOV token, so word1 is out of vocabulary
            sims_pred.append(0.)
            oov_count += 1
            continue
        word2_id = vocab.get_token_index(word2, 'token_in')  # vocabulary index of word2
        if word2_id == 1:  # word2 is out of vocabulary
            sims_pred.append(0.)
            oov_count += 1
            continue

        # Cosine similarity between the two word vectors.
        sim_pred = cosine(embedding.weight[word1_id], embedding.weight[word2_id]).item()
        sims_pred.append(sim_pred)

    assert len(sims_pred) == len(simlex999)  # sims_pred and simlex999 must have the same length
    print('# of OOV words: {} / {}'.format(oov_count, len(simlex999)))
    print(pearsonr(sims_pred, [sim for _, _, sim in simlex999]))
    # Spearman's rank-order correlation compares the two sets of similarities and
    # returns the correlation coefficient along with the p-value for non-correlation.
    return spearmanr(sims_pred, [sim for _, _, sim in simlex999])
Example No. 16
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace="1")
        assert "word" in vocab.get_index_to_token_vocabulary(
            namespace="1").values()
        assert vocab.get_token_index("word", namespace="1") == word_index
        assert vocab.get_token_from_index(word_index, namespace="1") == "word"
        assert vocab.get_vocab_size(namespace="1") == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace="2")
        word_index = vocab.add_token_to_namespace("word", namespace="2")
        assert "word" in vocab.get_index_to_token_vocabulary(
            namespace="2").values()
        assert "word2" in vocab.get_index_to_token_vocabulary(
            namespace="2").values()
        assert vocab.get_token_index("word", namespace="2") == word_index
        assert vocab.get_token_index("word2", namespace="2") == word2_index
        assert vocab.get_token_from_index(word_index, namespace="2") == "word"
        assert vocab.get_token_from_index(word2_index,
                                          namespace="2") == "word2"
        assert vocab.get_vocab_size(namespace="2") == initial_vocab_size + 2
Example No. 17
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace="tokens")):
         token = self.vocab.get_token_from_index(index=index, namespace="tokens")
         archived_token_index = archived_vocab.get_token_index(token, namespace="tokens")
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if (
             archived_vocab.get_token_from_index(archived_token_index, namespace="tokens")
             == token
         ):
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
Example No. 18
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> List[int]:
     indices = []
     if token.text is None:
         raise ConfigurationError(
             'TokenCharactersIndexer needs a tokenizer that retains text')
     for character in self._character_tokenizer.tokenize(token.text):
         if getattr(character, 'text_id', None) is not None:
             # `text_id` being set on the token means that we aren't using the vocab, we just
             # use this id instead.
             index = character.text_id
         else:
             index = vocabulary.get_token_index(
                 character.text, self._namespace)
         indices.append(index)
     return indices
Example No. 19
def get_synonyms(token: str,
                 embedding: Model,
                 vocab: Vocabulary,
                 num_synonyms: int = 10):
    """Given a token, return a list of the top N most similar words to the token."""
    token_id = vocab.get_token_index(token, 'tags_in')
    token_vec = embedding.weight[token_id]

    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('tags_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim

    return sims.most_common(num_synonyms)
Example No. 20
    def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
        tags: List[str] = []

        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            if not tag:
                tag = "NONE"

            tags.append(tag)

        return {"tokens": [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
Example No. 21
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: str = '4') -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('labels'))

        positive_index = vocab.get_token_index(positive_label, namespace='labels')
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_index)

        self.loss_function = torch.nn.CrossEntropyLoss()
Example No. 22
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        tags: List[str] = []

        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            if tag is None:
                tag = 'NONE'

            tags.append(tag)

        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
Example No. 23
    def tokens_to_indices(self, tokens: List[Token],
                          vocabulary: Vocabulary) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens,
                                     self._end_tokens):
            text = self._get_feature_value(token)
            if self.namespace is None:
                # We could have a check here that `text` is an int; not sure it's worth it.
                indices.append(text)  # type: ignore
            else:
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(
                    text, self.namespace))

        return {"tokens": indices}
Example No. 24
def _doc_bioul_to_spans(doc: List[str],
                        vocab: Vocabulary) -> List[Tuple[int, int, int]]:
    '''Given BIOUL predictions for one document, return entities in the span format'''
    spans = []
    for i, l in enumerate(doc):
        if l != 'O':
            span_label = l[2:]
            span_label_index = vocab.get_token_index(
                span_label,
                namespace='span_labels')  # TODO: is this the right namespace?
        if l.startswith('U'):
            spans.append((i, i + 1, span_label_index))
        elif l.startswith('B'):
            start_index = i
        elif l.startswith('L'):
            spans.append((start_index, i + 1, span_label_index))
    return spans
Example No. 25
    def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            if getattr(token, "text_id", None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {"tokens": indices}
Example No. 26
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in tokens:
            if getattr(token, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab,
                # we just use this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(
                    text, self.namespace))

        return {index_name: indices}
Example No. 27
 def token_to_indices(self, token: Token,
                      vocabulary: Vocabulary) -> List[int]:
     indices = []
     if token.text is None:
         raise ConfigurationError(
             'TokenBPEIndexer needs a tokenizer that retains text')
     for piece in self._bpe_tokenizer.tokenize(token.text):
         if getattr(piece, 'text_id', None) is not None:
             # `text_id` being set on the token means that we aren't using the vocab, we just
             # use this id instead.
             index = piece.text_id
         else:
             index = vocabulary.get_token_index(piece.text, self._namespace)
         indices.append(index)
         if index == 1:  # index 1 is the default OOV token index
             self.oov_count += 1
         self.total_count += 1
     return indices
Example No. 28
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            if getattr(token, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead.
                indices.append(token.text_id)
            else:
                text = token.text
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {index_name: indices}
Example No. 29
 def tokens_to_indices(self,
                       tokens: List[Token],
                       vocabulary: Vocabulary,
                       index_name: str) -> Dict[str, List[List[int]]]:
     indices: List[List[int]] = []
     for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
         token_indices: List[int] = []
         if token.text is None:
             raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
         for character in self._character_tokenizer.tokenize(token.text):
             if getattr(character, 'text_id', None) is not None:
                 # `text_id` being set on the token means that we aren't using the vocab, we just
                 # use this id instead.
                 index = character.text_id
             else:
                 index = vocabulary.get_token_index(character.text, self._namespace)
             token_indices.append(index)
         indices.append(token_indices)
     return {index_name: indices}
Example No. 30
 def tokens_to_indices(self,
                       tokens: List[Token],
                       vocabulary: Vocabulary,
                       index_name: str) -> Dict[str, List[List[int]]]:
     indices: List[List[int]] = []
     for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
         token_indices: List[int] = []
         if token.text is None:
             raise ConfigurationError('TokenCharactersIndexer needs a tokenizer that retains text')
         for character in self._character_tokenizer.tokenize(token.text):
             if getattr(character, 'text_id', None) is not None:
                 # `text_id` being set on the token means that we aren't using the vocab, we just
                 # use this id instead.
                 index = character.text_id
             else:
                 index = vocabulary.get_token_index(character.text, self._namespace)
             token_indices.append(index)
         indices.append(token_indices)
     return {index_name: indices}
Example No. 31
    def __init__(self, vocab: Vocabulary, embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=2 * encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))

        #self._ffnn = torch.nn.Sequential(
        ##linear
        ##dropout
        ##tanh
        ##linear
        #)
        # define f1 here, use as plain F1 measure not spanBased
        self._metric = F1Measure(positive_label=vocab.get_token_index(
            token='positive', namespace='labels'))
Example No. 32
def get_related(token: str,
                embedding: Model,
                vocab: Vocabulary,
                num_related: int = 20):
    """Given a token, return a list of top 20 most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]  # embedding vector of the query token
    cosine = CosineSimilarity(dim=0)  # cosine similarity between two vectors
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
        # Cosine similarity of our token vector with every other word vector in the vocabulary
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim  # store the similarity score

    return sims.most_common(num_related)
Example No. 33
    def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        tags: List[str] = []

        for token in tokens:
            if self._coarse_tags:
                tag = token.pos_
            else:
                tag = token.tag_
            if not tag:
                tag = 'NONE'

            tags.append(tag)
        return {index_name: [vocabulary.get_token_index(tag, self._namespace) for tag in tags]}
Example No. 34
    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[List[int]]]:
        vocab_size = vocabulary.get_vocab_size(self._namespace)

        # Initial steps are exactly the same as super().tokens_to_indices()
        indices: List[List[int]] = []

        for token in itertools.chain(self._start_tokens, tokens,
                                     self._end_tokens):
            token_indices: List[int] = []

            if token.text is None:
                raise ConfigurationError(
                    'BMETokenIndexer needs a tokenizer that retains text')

            for character in self._character_tokenizer.tokenize(token.text):
                if getattr(character, 'text_id', None) is not None:
                    # `text_id` being set on the token means that we aren't using the vocab, we just
                    # use this id instead.
                    index = character.text_id
                else:
                    index = vocabulary.get_token_index(character.text,
                                                       self._namespace)

                token_indices.append(index)

            # Generating BME (steps that are different from super().tokens_to_indices())
            B = F.one_hot(torch.tensor(
                self._pad(token_indices[:self._begin_size], self._begin_size,
                          True)),
                          num_classes=vocab_size).reshape(-1)
            M = F.one_hot(torch.tensor(
                [0] if len(token_indices) == 0 else token_indices),
                          num_classes=vocab_size).sum(0)
            E = F.one_hot(torch.tensor(
                self._pad(token_indices[-self._end_size:], self._end_size)),
                          num_classes=vocab_size).reshape(-1)

            indices.append(torch.cat((B, M, E)).tolist())
        return {index_name: indices}
Example No. 35
    def test_from_params_valid_vocab_extension_thoroughly(self):
        '''
        Tests for Valid Vocab Extension thoroughly: Vocab extension is valid
        when overlapping namespaces have same padding behaviour (padded/non-padded)
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (for the tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        '''

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2
        original_vocab.add_token_to_namespace("bat", namespace="tokens0")   # index:3
        original_vocab.add_token_to_namespace("cat", namespace="tokens0")   # index:4

        original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0
        original_vocab.add_token_to_namespace("bat", namespace="tokens1")   # index:1
        original_vocab.add_token_to_namespace("cat", namespace="tokens1")   # index:2

        original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0
        original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1
        original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2

        original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0
        original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                     "text4": text_field4, "text5": text_field5})])

        params = Params({"directory_path": vocab_dir,
                         "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens5"]})
        extended_vocab = Vocabulary.from_params(params, instances)

        # Namespaces tokens0 and tokens1 are common to both; tokens2 and tokens3 exist
        # only in the original vocab, while tokens4 and tokens5 exist only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces list is consistent after extension
        assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

        # original_vocab["tokens1"] has 3 tokens; the instances contribute 6 tokens, 3 of which overlap
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size("tokens0") == 8 # 2 more than tokens1 because tokens0 is padded (padding + OOV)

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # and their token counts should stay the same in extended_vocab
        assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in the instances,
        # so their token counts come entirely from the instances
        assert extended_vocab.get_vocab_size("tokens4") == 6 # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3 # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
                assert vocab_token == extended_vocab_token
Example No. 36
# coding=utf-8
# @Author: 莫冉
# @Date: 2020-08-06

from allennlp.data.vocabulary import Vocabulary

vocab_file = "../data/base_bert/vocab.txt"
save_path = "../../../vocab_path"

vocab = Vocabulary(padding_token="[PAD]", oov_token="[UNK]")

vocab.set_from_file(vocab_file, is_padded=True, oov_token="[UNK]")

vocab.save_to_files(save_path)

print(vocab.get_token_index(vocab._oov_token))
Example No. 37
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2VecEncoder,
        kg_encoder: Seq2VecEncoder,
        max_decoding_steps: int = 64,
        attention: Attention = None,
        target_namespace: str = "tokens",
        scheduled_sampling_ratio: float = 0.4,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio  # Maybe we can try
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self.pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                    self._target_namespace)
        self.hidden_dim = 300
        self._max_decoding_steps = max_decoding_steps
        self.kd_metric = KD_Metric()
        self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
        self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
        self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
        self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
        self.topic_acc = Average()
        self.distinct1 = Distinct1()
        self.distinct2 = Distinct2()
        # anything about module
        self._source_embedder = source_embedder
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        target_embedding_dim = source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        self._encoder = encoder
        self._kg_encoder = kg_encoder
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim
        # self.select_entity_num = 3
        self._decoder_input_dim = self.hidden_dim * 2 + total_entiy  #self.select_entity_num
        self._attention = None
        if attention:
            self._attention = attention
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim

        self._decoder_cell = LSTMCell(self.hidden_dim * 2,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self.hidden_dim, num_classes)
        # with open('cy/comp_topic2num.pk', 'rb') as f:
        with open('fd/word2idx.pk', 'rb') as f:
            self.word_idx = pickle.load(f)
        self.vocab_to_idx = {}
        self.idx_to_vocab_list = []
        for word, k in self.word_idx.items():
            self.vocab_to_idx[vocab.get_token_index(word.strip())] = k
            self.idx_to_vocab_list.append(vocab.get_token_index(word.strip()))
        self.entity_size = total_entiy
        self.entity_embedding = torch.nn.Parameter(
            torch.Tensor(self.entity_size, self.hidden_dim))
        torch.nn.init.xavier_uniform_(self.entity_embedding, gain=1.414)
        self.entity_linear = Linear(self.hidden_dim * 2, self.entity_size)
        self.gen_linear = Linear(self.hidden_dim, 1)
        self.clac_num = 0
Example No. 38
 def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
     dep_label = token.dep_ or 'NONE'
     return vocabulary.get_token_index(dep_label, self.namespace)
Example No. 39
 def index(self, vocab: Vocabulary):
     self._mapping_array = [vocab.get_token_index(x.text, self._target_namespace)
                            for x in self._source_tokens]
Example No. 40
 def index(self, vocab: Vocabulary):
     if self._indexed_labels is None:
         self._indexed_labels = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                                 for label in self.labels]
Example No. 41
    def __init__(self,
                 vocab: Vocabulary,
                 token_embedder: TextFieldEmbedder,
                 entity_embedder: TextFieldEmbedder,
                 relation_embedder: TextFieldEmbedder,
                 knowledge_graph_path: str,
                 use_shortlist: bool,
                 hidden_size: int,
                 num_layers: int,
                 cutoff: int = 30,
                 tie_weights: bool = False,
                 dropout: float = 0.4,
                 dropouth: float = 0.3,
                 dropouti: float = 0.65,
                 dropoute: float = 0.1,
                 wdrop: float = 0.5,
                 alpha: float = 2.0,
                 beta: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator()) -> None:
        super(KglmDisc, self).__init__(vocab)

        # We extract the `Embedding` layers from the `TokenEmbedders` to apply dropout later on.
        # pylint: disable=protected-access
        self._token_embedder = token_embedder._token_embedders['tokens']
        self._entity_embedder = entity_embedder._token_embedders['entity_ids']
        self._relation_embedder = relation_embedder._token_embedders['relations']
        self._recent_entities = RecentEntities(cutoff=cutoff)
        self._knowledge_graph_lookup = KnowledgeGraphLookup(knowledge_graph_path, vocab=vocab)
        self._use_shortlist = use_shortlist
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._cutoff = cutoff
        self._tie_weights = tie_weights

        # Dropout
        self._locked_dropout = LockedDropout()
        self._dropout = dropout
        self._dropouth = dropouth
        self._dropouti = dropouti
        self._dropoute = dropoute
        self._wdrop = wdrop

        # Regularization strength
        self._alpha = alpha
        self._beta = beta

        # RNN Encoders.
        entity_embedding_dim = entity_embedder.get_output_dim()
        token_embedding_dim = token_embedder.get_output_dim()
        self.entity_embedding_dim = entity_embedding_dim
        self.token_embedding_dim = token_embedding_dim

        rnns: List[torch.nn.Module] = []
        for i in range(num_layers):
            if i == 0:
                input_size = token_embedding_dim
            else:
                input_size = hidden_size
            if i == num_layers - 1:
                output_size = token_embedding_dim + 2 * entity_embedding_dim
            else:
                output_size = hidden_size
            rnns.append(torch.nn.LSTM(input_size, output_size, batch_first=True))
        rnns = [WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop) for rnn in rnns]
        self.rnns = torch.nn.ModuleList(rnns)

        # Various linear transformations.
        self._fc_mention_type = torch.nn.Linear(
            in_features=token_embedding_dim,
            out_features=4)

        if not use_shortlist:
            self._fc_new_entity = torch.nn.Linear(
                in_features=entity_embedding_dim,
                out_features=vocab.get_vocab_size('entity_ids'))

            if tie_weights:
                self._fc_new_entity.weight = self._entity_embedder.weight

        self._state: Optional[Dict[str, Any]] = None

        # Metrics
        self._unk_index = vocab.get_token_index(DEFAULT_OOV_TOKEN)
        self._unk_penalty = math.log(vocab.get_vocab_size('tokens_unk'))
        self._avg_mention_type_loss = Average()
        self._avg_new_entity_loss = Average()
        self._avg_knowledge_graph_entity_loss = Average()
        self._new_mention_f1 = F1Measure(positive_label=1)
        self._kg_mention_f1 = F1Measure(positive_label=2)
        self._new_entity_accuracy = CategoricalAccuracy()
        self._new_entity_accuracy20 = CategoricalAccuracy(top_k=20)
        self._parent_ppl = Ppl()
        self._relation_ppl = Ppl()

        initializer(self)
Example No. 42
 def index(self, vocab: Vocabulary):
     if self._label_id is None:
         self._label_id = vocab.get_token_index(self.label, self._label_namespace)  # type: ignore
Example No. 43
 def index(self, vocab: Vocabulary):
     if self.is_global_rule and self._rule_id is None:
         self._rule_id = vocab.get_token_index(self.rule, self._vocab_namespace)
Example No. 44
 def index(self, vocab: Vocabulary):
     if self.labels is not None:
         self._indexed_labels = [
             vocab.get_token_index(label, self._label_namespace)
             for label in self.labels
         ]
Example No. 45
 def index(self, vocab: Vocabulary):
     if self._label_id is None:
         self._label_id = vocab.get_token_index(self.label, self._label_namespace)  # type: ignore
Example No. 46
    def test_from_params_valid_vocab_extension_thoroughly(self):
        """
        Tests for Valid Vocab Extension thoroughly: Vocab extension is valid
        when overlapping namespaces have same padding behaviour (padded/non-padded)
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (for the tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        """

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(
            non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens0")  # index:2
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens0")  # index:3
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens0")  # index:4

        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens1")  # index:0
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens1")  # index:1
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens1")  # index:2

        original_vocab.add_token_to_namespace("a",
                                              namespace="tokens2")  # index:0
        original_vocab.add_token_to_namespace("b",
                                              namespace="tokens2")  # index:1
        original_vocab.add_token_to_namespace("c",
                                              namespace="tokens2")  # index:2

        original_vocab.add_token_to_namespace("p",
                                              namespace="tokens3")  # index:0
        original_vocab.add_token_to_namespace("q",
                                              namespace="tokens3")  # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens0": SingleIdTokenIndexer("tokens0")},
        )
        text_field1 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens1": SingleIdTokenIndexer("tokens1")},
        )
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([
            Instance({
                "text0": text_field0,
                "text1": text_field1,
                "text4": text_field4,
                "text5": text_field5,
            })
        ])

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens5"],
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # Namespaces tokens0 and tokens1 are common to both; tokens2 and tokens3 exist
        # only in the original vocab, while tokens4 and tokens5 exist only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces list is consistent after extension
        assert extended_vocab._non_padded_namespaces == {
            "tokens1", "tokens3", "tokens5"
        }

        # original_vocab["tokens1"] has 3 tokens; the instances contribute 6 tokens, 3 of which overlap
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size(
            "tokens0") == 8  # 2 more than tokens1 because tokens0 is padded (padding + OOV)

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # and their token counts should stay the same in extended_vocab
        assert extended_vocab.get_vocab_size(
            "tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size(
            "tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in the instances,
        # so their token counts come entirely from the instances
        assert extended_vocab.get_vocab_size(
            "tokens4") == 6  # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(
                    token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(
                    index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(
                    index, namespace)
                assert vocab_token == extended_vocab_token
Example No. 47
 def index(self, vocab: Vocabulary):
     if self._label_ids is None:
         self._label_ids = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                            for label in self.labels]
     if not self._num_labels:
         self._num_labels = vocab.get_vocab_size(self._label_namespace)
Example No. 48
 def index(self, vocab: Vocabulary):
     if self.is_global_rule and self._rule_id is None:
         self._rule_id = vocab.get_token_index(self.rule, self._vocab_namespace)