Example #1
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 posclass_weight: float = 1.0,
                 use_power: bool = False,
                 dropout: float = 0.0) -> None:
        super().__init__(vocab)
        
        self.embedder = embedder
        self.encoder = encoder
        if use_power:
            self.classifier = torch.nn.Linear(
                in_features=encoder.get_output_dim() + 1,
                out_features=vocab.get_vocab_size('labels')
            )
        else:
            self.classifier = torch.nn.Linear(
                in_features=encoder.get_output_dim(),
                out_features=vocab.get_vocab_size('labels')
            )
        self.use_power = use_power
    
        self.f1_lie = F1Measure(vocab.get_token_index('False', 'labels'))
        self.f1_truth = F1Measure(vocab.get_token_index('True', 'labels'))
        self.micro_f1 = FBetaMeasure(average='micro')
        self.macro_f1 = FBetaMeasure(average='macro')
        
        # Class-weighted loss: up-weight the 'False' (lie) class; assumes a binary 'labels' namespace.
        weights = [1.0, 1.0]
        weights[vocab.get_token_index('False', 'labels')] = posclass_weight
        self.loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights))

        self.dropout = torch.nn.Dropout(dropout)
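
A quick illustration of the class-weighted loss above (a minimal sketch; the label index and weight value are assumptions, not taken from the example):

import torch

# If the 'False' label maps to index 0 and posclass_weight is 2.0, the weights
# passed to CrossEntropyLoss become [2.0, 1.0], so errors on the 'False' (lie)
# class are penalized twice as heavily.
weights = [1.0, 1.0]
weights[0] = 2.0
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights))
logits = torch.tensor([[0.2, 0.8]])  # one example, two label logits
gold = torch.tensor([0])             # gold label index for 'False'
print(loss_fn(logits, gold))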
Example #2
def get_masked_copynet_with_attention(vocab: Vocabulary,
                                      max_decoding_steps: int = 20,
                                      beam_size: int = 1) -> MaskedCopyNet:

    word_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("tokens"),
        embedding_dim=EMB_DIM
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": word_embeddings})

    masker_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("mask_tokens"),
        embedding_dim=MASK_EMB_DIM
    )
    masker_embeddings = BasicTextFieldEmbedder({"tokens": masker_embeddings})

    attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=HID_DIM * 2)
    mask_attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=MASK_EMB_DIM)
    lstm = PytorchSeq2SeqWrapper(nn.LSTM(EMB_DIM, HID_DIM, batch_first=True, bidirectional=True))

    return MaskedCopyNet(
        vocab=vocab,
        embedder=word_embeddings,
        encoder=lstm,
        max_decoding_steps=max_decoding_steps,
        attention=attention,
        mask_embedder=masker_embeddings,
        mask_attention=mask_attention,
        beam_size=beam_size
    )
Example #3
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TokenEmbedder,
                 sim_file_path: str,
                 window_size: int = 4,
                 num_neg_samples: int = 5,
                 neg_exponent: float = 0.75,
                 cuda_device: int = -1) -> None:
        super().__init__(vocab)
        self._device = (f'cuda:{cuda_device}'
                        if torch.cuda.is_available() and cuda_device >= 0
                        else 'cpu')
        self._ws353 = WS353(sim_file_path)

        self.embedder = embedder
        self._window_size = window_size
        self._num_neg_samples = num_neg_samples
        self.output_layer = nn.Linear(embedder.get_output_dim(),
                                      vocab.get_vocab_size('words'))

        # negative sampling with word frequency distribution
        self._word_dist = torch.zeros(vocab.get_vocab_size('words'))
        if vocab._retained_counter:
            for word, count in vocab._retained_counter['words'].items():
                word_idx = vocab.get_token_index(token=word, namespace='words')
                self._word_dist[word_idx] = count
            # prevent sampling process from choosing pad and unk tokens
            self._word_dist[vocab.get_token_index(token=vocab._padding_token,
                                                  namespace='words')] = 0
            self._word_dist[vocab.get_token_index(token=vocab._oov_token,
                                                  namespace='words')] = 0
            # prevent frequent words from sampling too frequently
            self._word_dist = torch.pow(self._word_dist, neg_exponent)
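
The distribution built above is deliberately left unnormalized; a minimal sketch of how it could be sampled from at training time (the counts below are made up for illustration):

import torch

# torch.multinomial accepts unnormalized, non-negative weights, so the
# count**neg_exponent tensor can be sampled from directly.
word_dist = torch.tensor([0.0, 0.0, 12.0, 5.0, 1.0]) ** 0.75  # pad/unk zeroed out
negative_ids = torch.multinomial(word_dist, num_samples=5, replacement=True)
print(negative_ids)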
Example #4
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 dropout: float = 0.1,
                 ff_dim: int = 100):
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder

        assert self.embedder.get_output_dim() == self.encoder.get_input_dim()

        self.feedforward = FeedForward(
            encoder.get_output_dim(),
            1,
            hidden_dims=ff_dim,
            activations=Activation.by_name('relu')(),
            dropout=dropout)
        self.out = torch.nn.Linear(
            in_features=self.feedforward.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))
        self.crf = ConditionalRandomField(vocab.get_vocab_size('labels'))

        self.f1 = FBetaMeasure(average='micro')
        self.accuracy = CategoricalAccuracy()
        self.idx_to_label = vocab.get_index_to_token_vocabulary('labels')
Example #5
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace(u"word", namespace=u'1')
        assert u"word" in list(
            vocab.get_index_to_token_vocabulary(namespace=u'1').values())
        assert vocab.get_token_index(u"word", namespace=u'1') == word_index
        assert vocab.get_token_from_index(word_index,
                                          namespace=u'1') == u"word"
        assert vocab.get_vocab_size(namespace=u'1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace(u"word2", namespace=u'2')
        word_index = vocab.add_token_to_namespace(u"word", namespace=u'2')
        assert u"word" in list(
            vocab.get_index_to_token_vocabulary(namespace=u'2').values())
        assert u"word2" in list(
            vocab.get_index_to_token_vocabulary(namespace=u'2').values())
        assert vocab.get_token_index(u"word", namespace=u'2') == word_index
        assert vocab.get_token_index(u"word2", namespace=u'2') == word2_index
        assert vocab.get_token_from_index(word_index,
                                          namespace=u'2') == u"word"
        assert vocab.get_token_from_index(word2_index,
                                          namespace=u'2') == u"word2"
        assert vocab.get_vocab_size(namespace=u'2') == initial_vocab_size + 2
Example #6
    def __init__(self, 
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True) -> None:
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn = Seq2SeqEncoder.from_params(Params({     
                "type": "lstm",
                "input_size": 1024,
                "hidden_size": 512,
                "bidirectional": True,
                "batch_first": True
            }))

        self.encoder = encoder

        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        else:
            # Without an explicit encoder, tag directly off the BERT output
            # (this branch assumes bert_embedder was provided).
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(torch.nn.Linear(
            in_features=hidden2tag_in_dim,
            out_features=vocab.get_vocab_size("labels")))
        
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        
        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels")
            )
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True
            )
        
        self.f1 = SpanBasedF1Measure(vocab, 
                                     tag_namespace="labels",
                                     ignore_classes=["news/type","negation",
                                                     "demonstrative_reference",
                                                     "timer/noun","timer/attributes"],
                                     label_encoding="BIO")
Example #7
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        contextualizer: Seq2SeqEncoder,
        dropout: float = None,
        num_samples: int = None,
        sparse_embeddings: bool = False,
        bidirectional: bool = False,
        initializer: InitializerApplicator = None,
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super().__init__(vocab, regularizer)
        self._text_field_embedder = text_field_embedder

        if contextualizer.is_bidirectional() is not bidirectional:
            raise ConfigurationError(
                "Bidirectionality of contextualizer must match bidirectionality of "
                "language model. "
                f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
                f"language model bidirectional: {bidirectional}")

        self._contextualizer = contextualizer
        self._bidirectional = bidirectional

        # The dimension for making predictions just in the forward
        # (or backward) direction.
        if self._bidirectional:
            self._forward_dim = contextualizer.get_output_dim() // 2
        else:
            self._forward_dim = contextualizer.get_output_dim()

        # TODO(joelgrus): more sampled softmax configuration options, as needed.
        if num_samples is not None:
            self._softmax_loss = SampledSoftmaxLoss(
                num_words=vocab.get_vocab_size(),
                embedding_dim=self._forward_dim,
                num_samples=num_samples,
                sparse=sparse_embeddings,
            )
        else:
            self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                              embedding_dim=self._forward_dim)

        # This buffer is now unused and exists only for backwards compatibility reasons.
        self.register_buffer("_last_average_loss", torch.zeros(1))

        self._perplexity = Perplexity()

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        if initializer is not None:
            initializer(self)
Example #8
    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)

        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=EMBEDDING_SIZE)
        self.embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        self.rnn = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(EMBEDDING_SIZE, HIDDEN_SIZE, batch_first=True))

        self.hidden2out = torch.nn.Linear(in_features=self.rnn.get_output_dim(),
                                          out_features=vocab.get_vocab_size('tokens'))
Example #9
    def __init__(
            self,
            cf_a,  # Configuration file
            vocab: Vocabulary) -> None:
        # Pass the vocabulary to the base Model class; AllenNLP requires it.
        super().__init__(vocab)

        self.cf_a = cf_a
        self.loss_func = cf_a.loss_func
        self.prior = cf_a.LSTM_prior
        """
        Token Embedding Biatch !! 
        """
        self.word_embeddings = self.get_embedder(vocab,
                                                 cf_a.Word_embedding_dim,
                                                 cf_a.char_embeddedng_dim,
                                                 cf_a.CNN_num_filters,
                                                 cf_a.CNN_encoder_dim)

        self.encoder = self.get_sec2vec_encoder(
            cf_a.CNN_encoder_dim + cf_a.Word_embedding_dim, cf_a.LSTM_H)

        if (cf_a.Bayesian_Linear):
            self.hidden2tag = LinearVB(
                in_features=self.cf_a.LSTM_H,
                out_features=vocab.get_vocab_size('tags_country'),
                bias=True,
                prior=cf_a.Linear_output_prior)
        else:
            self.hidden2tag = torch.nn.Linear(
                in_features=self.cf_a.LSTM_H,
                out_features=vocab.get_vocab_size('tags_country'))

        self.accuracy = CategoricalAccuracy()
        """
        List of Bayesian Linear Models.
        Using this list we can easily set the special requirements of VB models.
        And also analize easily the weights in the network
        """
        self.VBmodels = []
        self.LinearModels = []

        if (cf_a.Bayesian_Linear):
            self.VBmodels.append(self.hidden2tag)
        else:
            self.LinearModels.append(self.hidden2tag)

        if (cf_a.Bayesian_LSTM):
            self.VBmodels.extend(self.encoder.get_LSTMCells())
        else:
            self.LinearModels.extend(self.encoder.get_LSTMCells())
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 contextualizer: Seq2SeqEncoder,
                 dropout: float = None,
                 loss_scale: Union[float, str] = 1.0,
                 num_samples: int = None,
                 sparse_embeddings: bool = False,
                 bidirectional: bool = False,
                 initializer: InitializerApplicator = None) -> None:
        super().__init__(vocab)
        self._text_field_embedder = text_field_embedder

        if contextualizer.is_bidirectional() is not bidirectional:
            raise ConfigurationError(
                "Bidirectionality of contextualizer must match bidirectionality of "
                "language model. "
                f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
                f"language model bidirectional: {bidirectional}")

        self._contextualizer = contextualizer
        self._bidirectional = bidirectional

        # The dimension for making predictions just in the forward
        # (or backward) direction.
        if self._bidirectional:
            self._forward_dim = contextualizer.get_output_dim() // 2
        else:
            self._forward_dim = contextualizer.get_output_dim()

        # TODO(joelgrus): more sampled softmax configuration options, as needed.
        if num_samples is not None:
            self._softmax_loss = SampledSoftmaxLoss(
                num_words=vocab.get_vocab_size(),
                embedding_dim=self._forward_dim,
                num_samples=num_samples,
                sparse=sparse_embeddings)
        else:
            self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                              embedding_dim=self._forward_dim)

        # TODO(brendanr): Output perplexity here. e^loss
        self.register_buffer('_last_average_loss', torch.zeros(1))

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._loss_scale = loss_scale
        if initializer is not None:
            initializer(self)
Example #11
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 contextualizer: Seq2SeqEncoder,
                 dropout: float = None,
                 num_samples: int = None,
                 sparse_embeddings: bool = False,
                 bidirectional: bool = False,
                 initializer: InitializerApplicator = None) -> None:
        super().__init__(vocab)
        self._text_field_embedder = text_field_embedder

        if contextualizer.is_bidirectional() is not bidirectional:
            raise ConfigurationError(
                    "Bidirectionality of contextualizer must match bidirectionality of "
                    "language model. "
                    f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
                    f"language model bidirectional: {bidirectional}")

        self._contextualizer = contextualizer
        self._bidirectional = bidirectional

        # The dimension for making predictions just in the forward
        # (or backward) direction.
        if self._bidirectional:
            self._forward_dim = contextualizer.get_output_dim() // 2
        else:
            self._forward_dim = contextualizer.get_output_dim()

        # TODO(joelgrus): more sampled softmax configuration options, as needed.
        if num_samples is not None:
            self._softmax_loss = SampledSoftmaxLoss(num_words=vocab.get_vocab_size(),
                                                    embedding_dim=self._forward_dim,
                                                    num_samples=num_samples,
                                                    sparse=sparse_embeddings)
        else:
            self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                              embedding_dim=self._forward_dim)

        # TODO(brendanr): Output perplexity here. e^loss
        self.register_buffer('_last_average_loss', torch.zeros(1))

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        if initializer is not None:
            initializer(self)
Example #12
def main():
    reader = SkipGramReader()
    dataset = reader.read("data/cv/0/train.txt")
    vocab = Vocabulary.from_files("data/vocabulary")
    params = Params(params={})
    vocab.extend_from_instances(params, dataset)

    reader = SkipGramReader(vocab=vocab)
    dataset = reader.read("data/cv/0/train.txt")
    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    
    
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)
    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=dataset,
                      num_epochs=20,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    torch.save(embedding_in.state_dict(), "saved_models/word2vec.th")

    print(get_synonyms('C', embedding_in, vocab))
    print(get_synonyms('G7', embedding_in, vocab))
    print(get_synonyms('G', embedding_in, vocab))
    print(get_synonyms('F', embedding_in, vocab))
    print(get_synonyms('C7', embedding_in, vocab))
Example #13
    def test_add_word_to_index_gives_consistent_results(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1
Example #15
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder

        self.hidden2decision = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size("grammaticality_labels"))
        self.loss_function = nn.CrossEntropyLoss()
        self.accuracy = CategoricalAccuracy()
        self.vocab = vocab
        self.specificAccuracies = {}
        for ind in range(vocab.get_vocab_size(namespace="ugtype_labels")):
            self.specificAccuracies[ind] = CategoricalAccuracy()
Example #16
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 contextualizer: Seq2SeqEncoder,
                 forward_segmental_contextualizer: Seq2SeqEncoder,
                 backward_segmental_contextualizer: Seq2SeqEncoder,
                 label_feature_dim: int,
                 softmax_projection_dim: int,
                 label_namespace: str = "labels",
                 dropout: float = None,
                 num_samples: int = None,
                 sparse_embeddings: bool = False,
                 bidirectional: bool = True,
                 initializer: InitializerApplicator = None) -> None:
        super().__init__(vocab=vocab,
                         text_field_embedder=text_field_embedder,
                         contextualizer=contextualizer,
                         dropout=dropout,
                         num_samples=num_samples,
                         sparse_embeddings=sparse_embeddings,
                         bidirectional=bidirectional,
                         initializer=initializer)
        self._forward_segmental_contextualizer = forward_segmental_contextualizer
        self._backward_segmental_contextualizer = backward_segmental_contextualizer

        if num_samples is not None:
            self._softmax_loss = SampledSoftmaxLoss(
                num_words=vocab.get_vocab_size(),
                embedding_dim=softmax_projection_dim,
                num_samples=num_samples,
                sparse=sparse_embeddings)
        else:
            self._softmax_loss = _SoftmaxLoss(
                num_words=vocab.get_vocab_size(),
                embedding_dim=softmax_projection_dim)

        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.label_feature_embedding = Embedding(self.num_classes,
                                                 label_feature_dim)

        base_dim = contextualizer.get_output_dim() // 2
        seg_dim = base_dim + label_feature_dim
        self._forward_dim = softmax_projection_dim

        self.pre_segmental_layer = TimeDistributed(
            Linear(seg_dim, softmax_projection_dim))
        self.projection_layer = TimeDistributed(
            Linear(base_dim * 2, softmax_projection_dim))
Example #17
def generate_distance_target(index, eps=1):
    vocab = Vocabulary.from_files("data/vocabulary")

    vocab_size = vocab.get_vocab_size()
    weight = np.zeros((vocab_size, vocab_size))

    if index == 0:
        dist_func = distance_0
    elif index == 1:
        dist_func = distance_1
    elif index == 2:
        dist_func = distance_2
    else:
        raise ValueError(f"Unsupported distance index: {index}")

    for i in range(vocab_size):
        chord_i = vocab.get_token_from_index(i)
        for j in range(vocab_size):
            chord_j = vocab.get_token_from_index(j)
            if "@" in chord_i or "@" in chord_j:
                M = 1 - distance_0(chord_i, chord_j)
            else:
                dist = dist_func(chord_i, chord_j)
                M = 1 / (dist + eps)
            weight[i][j] = M

    max_value = np.max(weight)
    weight /= max_value

    weight = torch.from_numpy(weight).float()
    if not os.path.isdir("data/targets/"):
        os.makedirs("data/targets/")

    torch.save(weight, "data/targets/target_distance_{}.th".format(index))
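
A hypothetical follow-up showing how the saved target could be reloaded and queried (the path and chord names are illustrative, not taken from the example):

import torch
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_files("data/vocabulary")
target = torch.load("data/targets/target_distance_1.th")
i = vocab.get_token_index("C")
j = vocab.get_token_index("G7")
print(target[i, j])  # normalized inverse distance between the two chords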
Example #18
    def __init__(self,
                 vocab: Vocabulary,
                 source_text_embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 tied_source_embedder_key: Optional[str] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 positive_label: str = "algebra",  
                 target_namespace: str = "tokens")-> None:

        super(TextClassifier, self).__init__(vocab, regularizer)

        self._source_text_embedder = source_text_embedder
        self._target_namespace = target_namespace
        self._encoder = encoder
        self._linear = torch.nn.Linear(in_features=encoder.get_output_dim(), 
                                        out_features=vocab.get_vocab_size('labels'))
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
        
        self.accuracy = CategoricalAccuracy()
        positive_label = vocab.get_token_index(positive_label, namespace='labels')
        # for computing precision, recall and F1
        self.f1_measure = F1Measure(positive_label)

        # CrossEntropyLoss combines log-softmax and NLL loss, so it takes raw logits as input
        self.loss_function = torch.nn.CrossEntropyLoss()

        
        initializer(self)
Example #19
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 contextualizer: Seq2SeqEncoder,
                 layer_norm: Optional[MaskedLayerNorm] = None,
                 dropout: float = None,
                 loss_scale: Union[float, str] = 1.0,
                 remove_bos_eos: bool = True) -> None:
        super().__init__(vocab)
        self._text_field_embedder = text_field_embedder
        self._layer_norm = layer_norm or (lambda x: x)

        if not contextualizer.is_bidirectional():
            raise ConfigurationError("contextualizer must be bidirectional")

        self._contextualizer = contextualizer
        # The dimension for making predictions just in the forward
        # (or backward) direction.
        self._forward_dim = contextualizer.get_output_dim() // 2

        # TODO(joelgrus): Allow SampledSoftmaxLoss here by configuration
        self._softmax_loss = _SoftmaxLoss(num_words=vocab.get_vocab_size(),
                                          embedding_dim=self._forward_dim)

        self.register_buffer('_last_average_loss', torch.zeros(1))

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._loss_scale = loss_scale
        self._remove_bos_eos = remove_bos_eos
Example #20
    def __init__(self, vocab: Vocabulary,
                 embedding_target: TokenEmbedder,
                 embedding_context: TokenEmbedder,
                 neg_samples=10, cuda_device=-1):
        super().__init__(vocab)
        self.embedding_target = embedding_target
        self.embedding_context = embedding_context
        self.neg_samples = neg_samples
        self.cuda_device = cuda_device

        # Pre-compute probability for negative sampling
        if vocab is not None and 'token_target' in vocab._retained_counter:
            token_to_probs = {}
            token_counts = vocab._retained_counter['token_target']  # HACK
            total_counts = sum(token_counts.values())
            total_probs = 0.
            for token, counts in token_counts.items():
                unigram_freq = counts / total_counts
                unigram_freq = math.pow(unigram_freq, 0.75)
                token_to_probs[token] = unigram_freq
                total_probs += unigram_freq

            self.neg_sample_probs = np.ndarray((vocab.get_vocab_size('token_target'),))
            for token_id, token in vocab.get_index_to_token_vocabulary('token_target').items():
                self.neg_sample_probs[token_id] = token_to_probs.get(token, 0) / total_probs

        else:
            print('You need to construct vocab from instances to record the token count statistics')
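
Once neg_sample_probs has been normalized as above, negative token ids can be drawn with numpy's weighted sampling. A minimal sketch (the probabilities are made up):

import numpy as np

neg_sample_probs = np.array([0.0, 0.0, 0.5, 0.3, 0.2])  # pad/unk get zero probability
negatives = np.random.choice(len(neg_sample_probs), size=10, p=neg_sample_probs)
print(negatives)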
Example #21
def build_model(
        vocab: Vocabulary,
        embedding_dim: int,
        pretrained_file: str = None,
        initializer: InitializerApplicator = None,
        regularizer: RegularizerApplicator = None
        ) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    word_vec = Embedding(embedding_dim=embedding_dim,
                          num_embeddings=vocab_size,
                          pretrained_file=pretrained_file,
                          vocab=vocab)
    embedding = BasicTextFieldEmbedder({"tokens": word_vec})

    # Use ELMo
    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # embedding = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Use BERT
    # bert_embedder = PretrainedTransformerEmbedder(
    #     model_name='bert-base-uncased',
    #     max_length=512,
    #     train_parameters=False
    # )
    # embedding = BasicTextFieldEmbedder({"tokens": bert_embedder})

    encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    return SimpleClassifier(vocab, embedding, encoder, initializer, regularizer=regularizer)
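
A minimal usage sketch for build_model (the vocabulary path and embedding size are assumptions):

vocab = Vocabulary.from_files("data/vocabulary")
model = build_model(vocab, embedding_dim=100)
print(model)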
Example #22
 def __init__(self, text_field_embedder: TextFieldEmbedder,
              type_field_embedder: TextFieldEmbedder,
              vocab: Vocabulary) -> None:
     super().__init__(vocab)
     self.text_field_embedder = text_field_embedder
     # self.hidden2tag = torch.nn.Linear(in_features=self.text_field_embedder.get_output_dim(),
     #                                   out_features=vocab.get_vocab_size('labels'))
     self.type_field_embedder = type_field_embedder
     self.hidden2medium = torch.nn.Linear(
         in_features=self.text_field_embedder.get_output_dim() + 20,
         out_features=int(
             (self.text_field_embedder.get_output_dim() + 20) / 2))
     self.dropout = torch.nn.Dropout(0.2)
     self.medium2tag = torch.nn.Linear(
         in_features=int(
             (self.text_field_embedder.get_output_dim() + 20) / 2),
         out_features=vocab.get_vocab_size("labels"))
     self.loss = FocalLoss()
     self.metrics = {
         # "accuracy": CategoricalAccuracy(),
         "f1_measure":
         SpanBasedF1Measure(vocabulary=vocab,
                            tag_namespace="labels",
                            ignore_classes=[""])
     }
Example #23
 def add_task(self, task_tag: str, vocab: Vocabulary):
     self.classification_layers.append(
         torch.nn.Linear(in_features=self.hidden_dim,
                         out_features=vocab.get_vocab_size('labels')))
     self.num_task = self.num_task + 1
     self.task2id[task_tag] = self.num_task
     self.tasks_vocabulary[task_tag] = vocab
Example #24
    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: Union[str, BertModel],
        dropout: float = 0.0,
        num_labels: int = None,
        index: str = "bert",
        label_namespace: str = "labels",
        trainable: bool = True,
        initializer: InitializerApplicator = InitializerApplicator()
    ) -> None:
        super().__init__(vocab)

        if isinstance(bert_model, str):
            self.bert_model = PretrainedBertModel.load(bert_model)
        else:
            self.bert_model = bert_model
        # Setting requires_grad on the module object has no effect; (un)freeze the parameters instead.
        for param in self.bert_model.parameters():
            param.requires_grad = trainable

        in_features = self.bert_model.config.hidden_size
        if num_labels:
            out_features = num_labels
        else:
            out_features = vocab.get_vocab_size(label_namespace)

        self._dropout = torch.nn.Dropout(p=dropout)
        self._tagger_layer = torch.nn.Linear(in_features, out_features)
        self._span_f1 = SpanBasedF1Measure(vocab,
                                           label_namespace,
                                           label_encoding='BIO')
        self._loss = torch.nn.CrossEntropyLoss()
        self._index = index
        initializer(self._tagger_layer)
Example #25
    def __init__(
        self,
        vocab: Vocabulary,
        embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder = None,
        dropout: float = 0.5,
        label_namespace: str = "entity_tags",
    ) -> None:
        super().__init__(vocab)
        self.vocab = vocab
        self.embedder = embedder
        self.encoder = encoder
        self.dropout = Dropout(dropout)

        self.label_namespace = label_namespace
        self.labels = vocab.get_index_to_token_vocabulary(label_namespace)
        num_labels = vocab.get_vocab_size(label_namespace)

        self.label_projection_layer = TimeDistributed(
            torch.nn.Linear(
                embedder.get_output_dim()
                if encoder is None else encoder.get_output_dim(), num_labels))
        self.crf = ConditionalRandomField(num_labels,
                                          include_start_end_transitions=True)

        self.metrics = {
            "span_f1":
            SpanBasedF1Measure(vocab,
                               tag_namespace=label_namespace,
                               label_encoding="BIO"),
            "accuracy":
            CategoricalAccuracy(),
        }
Example #26
    def __init__(self,
                 vocab: Vocabulary,
                 action_embedding_dim: int,
                 text_field_embedder: TextFieldEmbedder = None,
                 dropout: float = 0.0,
                 rule_namespace: str = 'rule_labels',
                 debug: bool=False,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(DROPParserBase, self).__init__(vocab=vocab, regularizer=regularizer)

        self._denotation_accuracy = Average()
        self._consistency = Average()

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._rule_namespace = rule_namespace

        # This flag turns on the debugging mode which prints a bunch of stuff in self.decode (inside functions as well)
        self._debug = debug

        self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                          embedding_dim=action_embedding_dim,
                                          vocab_namespace=self._rule_namespace)

        self._action_embedding_dim = action_embedding_dim
        # This is what we pass as input in the first step of decoding, when we don't have a
        # previous action.
        self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        torch.nn.init.normal_(self._first_action_embedding, mean=0.0, std=0.001)
Example #27
def build_seq2seq_model(flags,
                        data_reader,
                        vocab: Vocabulary,
                        source_namespace: str = 'source_tokens',
                        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(
        vocab.get_vocab_size(namespace=source_namespace),
        embedding_dim=flags.source_embedding_dim)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    lstm_encoder = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim,
                      flags.encoder_hidden_dim,
                      batch_first=True,
                      bidirectional=flags.encoder_bidirectional))
    attention = DotProductAttention()
    model = SimpleSeq2Seq(vocab,
                          source_embedder,
                          lstm_encoder,
                          flags.max_decode_length,
                          target_embedding_dim=flags.decoder_hidden_dim,
                          target_namespace=target_namespace,
                          attention=attention,
                          beam_size=flags.beam_size,
                          use_bleu=True)
    return model
Example #28
 def __init__(
     self,
     #### The embedding layer is specified as an AllenNLP <code>TextFieldEmbedder</code>
     #### which represents a general way of turning tokens into tensors.
     #### (Here we know that we want to represent each unique word with a learned tensor,
     #### but using the general class allows us to easily experiment with different types
     #### of embeddings, for example <a href = "https://allennlp.org/elmo">ELMo</a>.)
     word_embeddings: TextFieldEmbedder,
     #### Similarly, the encoder is specified as a general <code>Seq2SeqEncoder</code>
     #### even though we know we want to use an LSTM. Again, this makes it easy to
     #### experiment with other sequence encoders, for example a Transformer.
     encoder: Seq2SeqEncoder,
     #### Every AllenNLP model also expects a <code>Vocabulary</code>,
     #### which contains the namespaced mappings of tokens to indices and labels to indices.
     vocab: Vocabulary
 ) -> None:
     #### Notice that we have to pass the vocab to the base class constructor.
     super().__init__(vocab)
     self.word_embeddings = word_embeddings
     self.encoder = encoder
     #### The feed forward layer is not passed in as a parameter, but is constructed by us.
     #### Notice that it looks at the encoder to find the correct input dimension and looks
     #### at the vocabulary (and, in particular, at the label -> index mapping) to find the correct output dimension.
     self.hidden2tag = torch.nn.Linear(
         in_features=encoder.get_output_dim(),
         out_features=vocab.get_vocab_size('labels'))
     #### The last thing to notice is that we also instantiate a
     #### <code>CategoricalAccuracy</code> metric, which we'll use to track accuracy
     #### during each training and validation epoch.
     self.accuracy = CategoricalAccuracy()
Example #29
    def __init__(self,
                 vocab: Vocabulary,
                 embedding_dim: int,
                 use_crf: bool = False,
                 label_namespace: str = "xpos_tags"):
        super().__init__(vocab)
        self.label_namespace = label_namespace
        self.labels = vocab.get_index_to_token_vocabulary(label_namespace)
        num_labels = vocab.get_vocab_size(label_namespace)

        if use_crf:
            self.crf = ConditionalRandomField(
                num_labels, include_start_end_transitions=True)
            self.label_projection_layer = TimeDistributed(
                torch.nn.Linear(embedding_dim, num_labels))
            self.decoder = None
        else:
            self.crf = None
            self.decoder = GruSeq2SeqEncoder(input_size=embedding_dim,
                                             hidden_size=embedding_dim,
                                             num_layers=1,
                                             bidirectional=True)
            self.label_projection_layer = TimeDistributed(
                torch.nn.Linear(self.decoder.get_output_dim(), num_labels))

        from allennlp.training.metrics import CategoricalAccuracy

        self.metrics = {"accuracy": CategoricalAccuracy()}
Example #30
    def __init__(self,
                 vocab: Vocabulary,
                 sentence_embedder: TextFieldEmbedder,
                 action_embedding_dim: int,
                 encoder: Seq2SeqEncoder,
                 dropout: float = 0.0,
                 rule_namespace: str = 'rule_labels') -> None:
        super(NlvrSemanticParser, self).__init__(vocab=vocab)

        self._sentence_embedder = sentence_embedder
        self._denotation_accuracy = Average()
        self._consistency = Average()
        self._encoder = encoder
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._rule_namespace = rule_namespace

        self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                          embedding_dim=action_embedding_dim)

        # This is what we pass as input in the first step of decoding, when we don't have a
        # previous action.
        self._first_action_embedding = torch.nn.Parameter(
            torch.FloatTensor(action_embedding_dim))
        torch.nn.init.normal_(self._first_action_embedding)
Example #31
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: int = 4) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.linear = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))

        # Monitor the metrics - we use accuracy, as well as prec, rec, f1 for 4 (very positive)
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_label)

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()
Example #32
    def __init__(self,
                 vocab: Vocabulary,
                 sentence_embedder: TextFieldEmbedder,
                 action_embedding_dim: int,
                 encoder: Seq2SeqEncoder,
                 dropout: float = 0.0,
                 rule_namespace: str = 'rule_labels') -> None:
        super(NlvrSemanticParser, self).__init__(vocab=vocab)

        self._sentence_embedder = sentence_embedder
        self._denotation_accuracy = Average()
        self._consistency = Average()
        self._encoder = encoder
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._rule_namespace = rule_namespace

        self._action_embedder = Embedding(num_embeddings=vocab.get_vocab_size(self._rule_namespace),
                                          embedding_dim=action_embedding_dim)

        # This is what we pass as input in the first step of decoding, when we don't have a
        # previous action.
        self._first_action_embedding = torch.nn.Parameter(torch.FloatTensor(action_embedding_dim))
        torch.nn.init.normal_(self._first_action_embedding)
Example #33
    def __init__(self, vocab: Vocabulary, embedding_in: Embedding, embedding_out: Embedding, neg_samples=10, cuda_device=-1):
        super().__init__(vocab)
        self.embedding_in = embedding_in
        self.embedding_out = embedding_out
        self.neg_samples = neg_samples
        self.cuda_device = cuda_device

        check_if_counter = getattr(vocab, '_retained_counter', None)
        if not check_if_counter:
            return
        # pre-compute probability for negative sampling
        token_to_probs = {}
        token_counts = vocab._retained_counter['tags_in']
        assert len(token_counts) > 2

        total_counts = sum(token_counts.values())
        total_probs = 0.

        for token, counts in token_counts.items():
            unigram_freq = counts / total_counts
            unigram_freq = math.pow(unigram_freq, 3 / 4)
            token_to_probs[token] = unigram_freq
            total_probs += unigram_freq

        self.neg_sample_probs = np.ndarray((vocab.get_vocab_size('tags_in'),))
        for token_id, token in vocab.get_index_to_token_vocabulary('tags_in').items():
            self.neg_sample_probs[token_id] = token_to_probs.get(token, 0) / total_probs
Example #34
 def __init__(self,
              word_embeddings: TextFieldEmbedder,
              encoder: Seq2SeqEncoder,
              vocab: Vocabulary) -> None:
     super().__init__(vocab)
     self.word_embeddings = word_embeddings
     self.encoder = encoder
     self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                       out_features=vocab.get_vocab_size('labels'))
     self.accuracy = CategoricalAccuracy()
Example #35
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
Example #36
 def __init__(self,
              #### The embedding layer is specified as an AllenNLP <code>TextFieldEmbedder</code> which represents a general way of turning tokens into tensors. (Here we know that we want to represent each unique word with a learned tensor, but using the general class allows us to easily experiment with different types of embeddings, for example <a href = "https://allennlp.org/elmo">ELMo</a>.)
              word_embeddings: TextFieldEmbedder,
              #### Similarly, the encoder is specified as a general <code>Seq2SeqEncoder</code> even though we know we want to use an LSTM. Again, this makes it easy to experiment with other sequence encoders, for example a Transformer.
              encoder: Seq2SeqEncoder,
              #### Every AllenNLP model also expects a <code>Vocabulary</code>, which contains the namespaced mappings of tokens to indices and labels to indices.
              vocab: Vocabulary) -> None:
     #### Notice that we have to pass the vocab to the base class constructor.
     super().__init__(vocab)
     self.word_embeddings = word_embeddings
     self.encoder = encoder
     #### The feed forward layer is not passed in as a parameter, but is constructed by us. Notice that it looks at the encoder to find the correct input dimension and looks at the vocabulary (and, in particular, at the label -> index mapping) to find the correct output dimension.
     self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                       out_features=vocab.get_vocab_size('labels'))
     #### The last thing to notice is that we also instantiate a <code>CategoricalAccuracy</code> metric, which we'll use to track accuracy during each training and validation epoch.
     self.accuracy = CategoricalAccuracy()
Example #37
 def index(self, vocab: Vocabulary):
     if self._label_ids is None:
         self._label_ids = [vocab.get_token_index(label, self._label_namespace)  # type: ignore
                            for label in self.labels]
     if not self._num_labels:
         self._num_labels = vocab.get_vocab_size(self._label_namespace)
Example #38
    def test_from_params_valid_vocab_extension_thoroughly(self):
        '''
        Tests valid vocab extension thoroughly: vocab extension is valid
        when overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to be extended with: cat, an, apple, banana, atom, bat
        -> extended_vocab index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        '''

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2
        original_vocab.add_token_to_namespace("bat", namespace="tokens0")   # index:3
        original_vocab.add_token_to_namespace("cat", namespace="tokens0")   # index:4

        original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0
        original_vocab.add_token_to_namespace("bat", namespace="tokens1")   # index:1
        original_vocab.add_token_to_namespace("cat", namespace="tokens1")   # index:2

        original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0
        original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1
        original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2

        original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0
        original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                     "text4": text_field4, "text5": text_field5})])

        params = Params({"directory_path": vocab_dir,
                         "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens5"]})
        extended_vocab = Vocabulary.from_params(params, instances)

        # Namespaces tokens0 and tokens1 are common; tokens2 and tokens3 exist only
        # in the original vocab, while tokens4 and tokens5 exist only in the instances.
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that the _non_padded_namespaces set is consistent after extension
        assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

        # # original_vocab["tokens1"] has 3 tokens, instances of "tokens1" ns has 5 tokens. 2 overlapping
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size("tokens0") == 8 # 2 extra overlapping because padded

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in instances,
        # so their counts in extended_vocab come from the instances alone
        assert extended_vocab.get_vocab_size("tokens4") == 6 # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3 # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
                assert vocab_token == extended_vocab_token