Example #1
    def test_build_vocab_mapping(self):

        vocab = Vocabulary({
            'target': {
                'aaa': 1,
                'bbb': 1,
                'ccc': 1,
                'ddd': 1,
                'eee': 1,
            },
            'tokens': {
                '111': 1,
                'aaa': 1,
                '222': 1,
                'bbb': 1,
                'ccc': 1,
                '333': 1,
                'ddd': 1,
                'eee': 1,
            }
        })

        mapping = CandidatesSelector._build_mapping(vocab, 'target', 'tokens')

        print(mapping)

        self.assertEqual(mapping[vocab.get_token_index('ccc', 'target')], vocab.get_token_index('ccc', 'tokens'))
        self.assertNotEqual(vocab.get_token_index('ccc', 'target'), vocab.get_token_index('ccc', 'tokens'))
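The test only exercises the observable behaviour of CandidatesSelector._build_mapping: a token shared by the 'target' and 'tokens' namespaces must map from its index in one namespace to its index in the other. The real implementation is not shown; a minimal sketch of such a mapping built purely from the shared token strings (build_namespace_mapping is a hypothetical name):

from allennlp.data import Vocabulary

def build_namespace_mapping(vocab: Vocabulary,
                            source_namespace: str,
                            target_namespace: str) -> dict:
    # Translate every index in `source_namespace` into the index the same
    # token string has in `target_namespace`.
    mapping = {}
    for index in range(vocab.get_vocab_size(source_namespace)):
        token = vocab.get_token_from_index(index, namespace=source_namespace)
        mapping[index] = vocab.get_token_index(token, namespace=target_namespace)
    return mapping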
Example #2
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.linear = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))

        # Monitor the metrics - we use accuracy, as well as precision, recall and F1 for each label
        self.accuracy = CategoricalAccuracy()
        self.f1_measure_positive = F1Measure(
            vocab.get_token_index("positive", "labels"))
        self.f1_measure_negative = F1Measure(
            vocab.get_token_index("negative", "labels"))
        self.f1_measure_neutral = F1Measure(
            vocab.get_token_index("neutral", "labels"))

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()
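The constructor only wires the embedder, encoder, linear layer, metrics and loss together; the forward pass is not part of this example. A minimal sketch of how these pieces are typically combined in an AllenNLP classifier, assuming a single text input named tokens and an optional label field (both names are assumptions, not taken from the original):

from allennlp.nn.util import get_text_field_mask

def forward(self, tokens, label=None):
    # Embed the word IDs, encode the sequence into a single vector,
    # then project to one logit per label.
    mask = get_text_field_mask(tokens)
    embeddings = self.word_embeddings(tokens)
    encoding = self.encoder(embeddings, mask)
    logits = self.linear(encoding)

    output = {"logits": logits}
    if label is not None:
        # CrossEntropyLoss applies the softmax internally.
        self.accuracy(logits, label)
        self.f1_measure_positive(logits, label)
        self.f1_measure_negative(logits, label)
        self.f1_measure_neutral(logits, label)
        output["loss"] = self.loss_function(logits, label)
    return output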
Example #3
    def test_embedding_constructed_directly_with_pretrained_file(self):

        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
        vocab.add_token_to_namespace(unicode_space)
        embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
        with gzip.open(embeddings_filename, "wb") as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
            embeddings_file.write(
                f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))

        num_embeddings = vocab.get_vocab_size()
        embedding_layer = Embedding(
            embedding_dim=3,
            num_embeddings=num_embeddings,
            pretrained_file=embeddings_filename,
            vocab=vocab,
        )
        word_vector = embedding_layer.weight.data[vocab.get_token_index(
            "word")]
        assert numpy.allclose(word_vector.numpy(),
                              numpy.array([1.0, 2.3, -1.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index(
            unicode_space)]
        assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3,
                                                                5.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index(
            "word2")]
        assert not numpy.allclose(word_vector.numpy(),
                                  numpy.array([1.0, 2.3, -1.0]))
Example #4
 def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
     vocab.add_token_to_namespace(unicode_space)
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, "wb") as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
         embeddings_file.write(
             f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
     params = Params({
         "pretrained_file": embeddings_filename,
         "embedding_dim": 3
     })
     embedding_layer = Embedding.from_params(params, vocab=vocab)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word")]
     assert numpy.allclose(word_vector.numpy(),
                           numpy.array([1.0, 2.3, -1.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         unicode_space)]
     assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3,
                                                             5.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([1.0, 2.3, -1.0]))
Example #5
 def index(self, vocab: Vocabulary):
     vocab_size = vocab.get_vocab_size()
     ids = {}
     for token in self._source_tokens:
         text = token.text.lower()
         text_ids = vocab.get_token_index(text)
         if text_ids == vocab.get_token_index(DEFAULT_OOV_TOKEN):
             self._out.append(ids.setdefault(text, len(ids) + vocab_size))
         else:
             self._out.append(text_ids)
Example #6
    def __init__(self,
                 vocab: Vocabulary,
                 recurrent_dropout_probability: float = 0.0,
                 embedding_dropout_probability: float = 0.0,
                 input_size=512,
                 hidden_size=512) -> None:
        """
        :param options_file: for initializing elmo BiLM
        :param weight_file: for initializing elmo BiLM
        :param requires_grad: Whether or not to finetune the LSTM layers
        :param recurrent_dropout_probability: recurrent dropout to add to LSTM layers
        """
        super(SimpleBiLM, self).__init__()

        self.forward_lm = PytorchSeq2SeqWrapper(StackedLstm(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            go_forward=True,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False,
            use_highway=True),
                                                stateful=True)
        self.reverse_lm = PytorchSeq2SeqWrapper(StackedLstm(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            go_forward=False,
            recurrent_dropout_probability=recurrent_dropout_probability,
            use_input_projection_bias=False,
            use_highway=True),
                                                stateful=True)

        # This will also be the encoder
        self.decoder = torch.nn.Linear(
            512, vocab.get_vocab_size(namespace='tokens'))

        self.vocab = vocab
        self.register_buffer(
            'eos_tokens',
            torch.LongTensor([
                vocab.get_token_index(tok) for tok in [
                    '.', '!', '?', '@@UNKNOWN@@', '@@PADDING@@', '@@bos@@',
                    '@@eos@@'
                ]
            ]))
        self.register_buffer(
            'invalid_tokens',
            torch.LongTensor([
                vocab.get_token_index(tok) for tok in [
                    '@@UNKNOWN@@', '@@PADDING@@', '@@bos@@', '@@eos@@',
                    '@@NEWLINE@@'
                ]
            ]))
        self.embedding_dropout_probability = embedding_dropout_probability
Example #7
 def generate_ids_out(vocab: Vocabulary, source_tokens: List[Token]):
     vocab_size = vocab.get_vocab_size()
     ids = {}
     out = []
     for token in source_tokens:
         text = token.text.lower()
         text_ids = vocab.get_token_index(text)
         if text_ids == vocab.get_token_index(DEFAULT_OOV_TOKEN):
             out.append(ids.setdefault(text, len(ids) + vocab_size))
         else:
             out.append(text_ids)
     return ids, out
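This helper implements the source-side indexing used by copy mechanisms: in-vocabulary tokens keep their regular ids, and every distinct OOV token receives a temporary id starting at vocab_size. A small usage sketch, assuming generate_ids_out and AllenNLP's Token/Vocabulary are in scope (the token strings are made up):

from allennlp.data import Token, Vocabulary

vocab = Vocabulary({"tokens": {"the": 1, "cat": 1}})
tokens = [Token(t) for t in ["the", "zyzzyva", "cat", "zyzzyva"]]
ids, out = generate_ids_out(vocab, tokens)
# Both occurrences of the OOV token "zyzzyva" share one extended id (>= vocab_size),
# while "the" and "cat" keep their regular vocabulary indices.
assert out[1] == out[3] and out[1] >= vocab.get_vocab_size()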
Example #8
    def __init__(
        self,
        vocabulary: Vocabulary,
        image_feature_size: int,
        embedding_size: int,
        hidden_size: int,
        attention_projection_size: int,
        constraint,
        max_caption_length: int = 20,
        beam_size: int = 1,
    ) -> None:
        super().__init__()
        self._vocabulary = vocabulary

        self.image_feature_size = image_feature_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.attention_projection_size = attention_projection_size

        # Shorthand variable names for convenience
        self.vocab_size = vocabulary.get_vocab_size()
        self._pad_index = vocabulary.get_token_index("@@UNKNOWN@@")
        self._boundary_index = vocabulary.get_token_index("@@BOUNDARY@@")

        self._embedding_layer = nn.Embedding(self.vocab_size,
                                             embedding_size,
                                             padding_idx=self._pad_index)

        self._updown_cell = UpDownCell(image_feature_size, embedding_size,
                                       hidden_size, attention_projection_size)

        self.to_glove = nn.Linear(hidden_size, self.embedding_size)
        self._output_layer = nn.Linear(self.embedding_size,
                                       self.vocab_size,
                                       bias=False)
        self._log_softmax = nn.LogSoftmax(dim=1)

        # We use beam search to find the most likely caption during inference.
        self._beam_size = beam_size
        self._beam_search = ConstraintBeamSearch(
            self._boundary_index,
            max_steps=max_caption_length,
            beam_size=beam_size,
            per_node_beam_size=beam_size // 2,
        )
        self._fc = constraint
        self._beam_search.update_parameter(self._fc.select_state_func)

        self._max_caption_length = max_caption_length

        self._initialize_glove()
Example #9
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError,
                           match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, "
                                 "so you must select one of the files inside "
                                 "providing a uri of the type: "
                                 "\\(path_or_url_to_archive\\)#path_inside_archive\\."):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example #10
 def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
     params = Params({
             'pretrained_file': embeddings_filename,
             'embedding_dim': 3,
             })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
     assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
     assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
Example #11
    def __init__(self,
                 vocab: Vocabulary,
                 mention_feedforward: FeedForward,
                 feature_size: int,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(NERTagger_Has_None, self).__init__(vocab, regularizer)

        # Number of classes determine the output dimension of the final layer
        self._n_labels = vocab.get_vocab_size('ner_labels')

        # TODO(dwadden) think of a better way to enforce this.
        # The null (empty-string) label must be tracked when calculating the metrics.
        null_label = vocab.get_token_index("", "ner_labels")
        assert null_label == 0  # If not, the dummy class won't correspond to the null label.

        self._ner_scorer = torch.nn.Sequential(
            TimeDistributed(mention_feedforward),
            TimeDistributed(
                torch.nn.Linear(mention_feedforward.get_output_dim(),
                                self._n_labels)))

        self._ner_metrics = NERMetrics(self._n_labels, null_label)

        self._loss = torch.nn.CrossEntropyLoss(reduction="sum")

        initializer(self)
Example #12
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
Example #13
    def __init__(
        self,
        vocabulary: Vocabulary,
        image_features_h5path: str,
        boxes_jsonpath: str,
        wordforms_tsvpath: str,
        hierarchy_jsonpath: str,
        nms_threshold: float = 0.85,
        max_given_constraints: int = 3,
        max_words_per_constraint: int = 3,
        in_memory: bool = True,
    ):
        super().__init__(image_features_h5path, in_memory=in_memory)

        self._vocabulary = vocabulary
        self._pad_index = vocabulary.get_token_index("@@UNKNOWN@@")

        self._boxes_reader = ConstraintBoxesReader(boxes_jsonpath)

        self._constraint_filter = ConstraintFilter(hierarchy_jsonpath,
                                                   nms_threshold,
                                                   max_given_constraints)
        self._fsm_builder = FiniteStateMachineBuilder(
            vocabulary, wordforms_tsvpath, max_given_constraints,
            max_words_per_constraint)
Example #14
class TestBagOfWordCountsTokenEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBagOfWordCountsTokenEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")

    def test_forward_calculates_bow_properly(self):
        params = Params({})
        embedder = BagOfWordCountsTokenEmbedder.from_params(self.vocab,
                                                            params=params)
        numpy_tensor = np.array([[2, 0], [3, 0], [4, 4]])
        inputs = torch.from_numpy(numpy_tensor).unsqueeze(1)
        embedder_output = embedder(inputs)
        numpy_tensor = np.array([[1, 0, 1, 0, 0, 0], [1, 0, 0, 1, 0, 0],
                                 [0, 0, 0, 0, 2, 0]])
        manual_output = torch.from_numpy(numpy_tensor).float()
        assert_almost_equal(embedder_output.data.numpy(),
                            manual_output.data.numpy())

    def test_projects_properly(self):
        params = Params({"projection_dim": 50})
        embedder = BagOfWordCountsTokenEmbedder.from_params(self.vocab,
                                                            params=params)
        numpy_tensor = np.array(
            [self.vocab.get_token_index(x) for x in ["1", "2", "3"]])
        inputs = torch.from_numpy(numpy_tensor).unsqueeze(1)
        embedder_output = embedder(inputs)
        assert embedder_output.shape[1] == 50
Example #15
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                u"think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                u"make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                u"difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                u"àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                u'pretrained_file': unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive.zip'),
                u'embedding_dim': 5
                })
        with pytest.raises(ValueError, message=u"No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in [u'.zip', u'.tar.gz']:
            archive_path = unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, u'folder/fake_embeddings.5d.txt')
            params = Params({
                    u'pretrained_file': file_uri,
                    u'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in list(token2vec.items()):
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), u'Problem with format ' + archive_path
Example #16
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 seq2vec_encoder: Seq2VecEncoder,
                 seq2seq_encoder: Optional[Seq2SeqEncoder] = None,
                 feedforward: Optional[FeedForward] = None,
                 dropout: float = 0.0,
                 do_batch_norm: bool = False) -> None:
        super(QuizGuesser, self).__init__(vocab)
        self.text_field_embedder = text_field_embedder
        self.seq2seq_encoder = seq2seq_encoder
        self.seq2vec_encoder = seq2vec_encoder
        self.feedforward = feedforward

        if self.feedforward is not None:
            entity_embedding_dim = self.feedforward.get_output_dim()
        else:
            entity_embedding_dim = self.seq2vec_encoder.get_output_dim()

        num_entities = vocab.get_vocab_size('entities')
        self.entity_embedder = Embedding(entity_embedding_dim,
                                         num_entities,
                                         vocab_namespace='entities',
                                         padding_index=vocab.get_token_index(
                                             vocab._oov_token,
                                             namespace='entities'))

        self.dropout = nn.Dropout(dropout)
        if do_batch_norm:
            self.batch_norm = nn.BatchNorm1d(num_entities)

        self.accuracy = CategoricalAccuracy(top_k=1)
        self.mean_reciprocal_rank = MeanReciprocalRank()
Example #17
    def __init__(self,
                 vocab: Vocabulary,
                 beam_size: int,
                 namespace: str = 'tokens',
                 end_symbol: str = None,
                 min_steps: int = None,
                 max_steps: int = 50,
                 per_node_beam_size: int = None,
                 disallow_repeated_ngrams: int = None,
                 repeated_ngrams_exceptions: List[str] = None,
                 length_penalizer: LengthPenalizer = None,
                 coverage_penalizer: CoveragePenalizer = None) -> None:
        self.beam_size = beam_size
        end_symbol = end_symbol or END_SYMBOL
        self._end_index = vocab.get_token_index(end_symbol, namespace)
        self.max_steps = max_steps
        self.min_steps = min_steps
        self.per_node_beam_size = per_node_beam_size or beam_size
        self.length_penalizer = length_penalizer
        self.coverage_penalizer = coverage_penalizer

        # Convert the token exceptions to their indexes
        self.disallow_repeated_ngrams = disallow_repeated_ngrams
        self.repeated_ngrams_exceptions = set()
        repeated_ngrams_exceptions = repeated_ngrams_exceptions or []
        token_to_index = vocab.get_token_to_index_vocabulary(namespace)
        for token in repeated_ngrams_exceptions:
            if token not in token_to_index:
                raise Exception(f'Could not add token exception {token} because {token} is not in the vocabulary')
            self.repeated_ngrams_exceptions.add(token_to_index[token])
Example #18
File: tag_f1.py  Project: AkshatSh/DPD
 def __init__(self, vocab: Vocabulary, class_labels: List[str]) -> None:
     self.class_labels = class_labels
     positive_labels = []
     for class_label in class_labels:
         positive_labels.append(vocab.get_token_index(class_label, namespace='labels'))
     
     self._pos_labels = set(positive_labels)
     super(TagF1, self).__init__(positive_label=1)
Example #19
 def index(self, vocab: Vocabulary):
     source_ids, _ = CopyField.generate_ids_out(vocab, self._source_tokens)
     for token in self._target_tokens:
         text = token.text.lower()
         if text in source_ids:
             self._out.append(source_ids[text])
         else:
             self._out.append(vocab.get_token_index(text))
Example #20
 def tokens_to_indices(self, tokens: List[Token],
                       vocabulary: Vocabulary,
                       index_name: str) -> Dict[str, List[int]]: # pylint: disable=unused-argument
     return {
             "token_ids": [10, 15] + \
                      [vocabulary.get_token_index(token.text, 'words') for token in tokens] + \
                      [25],
             "additional_key": [22, 29]
     }
Example #21
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 pooler: Seq2VecEncoder,
                 *,
                 logits_threshold: float = 0,
                 softmax: bool = False,
                 gamma: float = 0,
                 beta: float = 0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 report_confusion_matrix: bool = False,
                 report_samplewise: bool = False,
                 **kwargs) -> None:
        super().__init__(vocab,
                         text_field_embedder=text_field_embedder,
                         encoder=encoder,
                         **kwargs)

        self._pooler = pooler
        self._logits_threshold = logits_threshold
        self._softmax = softmax

        class_statistics = torch.ones(self._num_trigger_classes)
        vocab_counter: Dict[str, Dict[str,
                                      int]] = getattr(vocab,
                                                      "_retained_counter", {})
        label_counter = vocab_counter.get(self._trigger_label_namespace)
        if label_counter is not None:
            for label, rank in label_counter.items():
                idx = vocab.get_token_index(
                    label, namespace=self._trigger_label_namespace)
                class_statistics[idx] = rank
            logger.info(f'Class statistics: {class_statistics}')
        else:
            logger.info('The vocab counter is not retained.')

        if softmax:
            self._loss = ClassBalancedFocalLoss(
                CrossEntropyLoss,
                gamma=gamma,
                beta=beta,
                class_statistics=class_statistics)
        else:
            self._loss = ClassBalancedFocalLoss(
                BCEWithLogitsLoss,
                gamma=gamma,
                beta=beta,
                class_statistics=class_statistics[..., 1:])

        self._report_confusion_matrix = report_confusion_matrix
        self._report_samplewise = report_samplewise

        initializer(self)

        self.metrics: Dict[str, Dict[str,
                                     Metric]] = defaultdict(self._init_metrics)
Example #22
 def tokens_to_indices(self, tokens: List[Token],
                       vocabulary: Vocabulary) -> Dict[str, List[int]]:
     return {
         "token_ids": ([10, 15] + [
             vocabulary.get_token_index(token.text, "words")
             for token in tokens
         ] + [25]),
         "additional_key": [22, 29],
     }
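This toy indexer wraps the ordinary word ids with the fixed markers 10, 15 and 25 and always emits a second key. A short usage sketch, assuming the method lives on a custom TokenIndexer subclass (FixedExtraIndexer is a made-up name):

from allennlp.data import Token, Vocabulary

vocab = Vocabulary({"words": {"hello": 1, "world": 1}})
tokens = [Token("hello"), Token("world")]
indexer = FixedExtraIndexer()  # hypothetical class holding the tokens_to_indices above
output = indexer.tokens_to_indices(tokens, vocab)
# The real word ids sit between the fixed markers.
assert output["token_ids"][:2] == [10, 15] and output["token_ids"][-1] == 25
assert output["additional_key"] == [22, 29]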
Example #23
 def tokens_to_indices(self, tokens: List[Token],
                       vocabulary: Vocabulary,
                       index_name: str) -> Dict[str, List[int]]: # pylint: disable=unused-argument
     return {
             "token_ids": [10, 15] + \
                      [vocabulary.get_token_index(token.text, 'words') for token in tokens] + \
                      [25],
             "additional_key": [22, 29]
     }
Example #24
    def __init__(self, vocab: Vocabulary, class_labels: List[str]) -> None:
        self.class_labels = class_labels
        positive_labels = {}
        for class_label in class_labels:
            cls_index = vocab.get_token_index(class_label, namespace='labels')
            positive_labels[class_label] = (cls_index,
                                            F1Measure(
                                                positive_label=cls_index))

        self._postiive_labels = positive_labels
Example #25
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace='tokens')):
         token = self.vocab.get_token_from_index(index=index, namespace='tokens')
         archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
Example #26
 def __init__(self,
              vocab: Vocabulary,
              beam_size: int,
              namespace: str = 'tokens',
              end_symbol: str = None,
              max_steps: int = 500,
              per_node_beam_size: int = None) -> None:
     self.beam_size = beam_size
     end_symbol = end_symbol or END_SYMBOL
     self._end_index = vocab.get_token_index(end_symbol, namespace)
     self.max_steps = max_steps
     self.per_node_beam_size = per_node_beam_size or beam_size
Example #27
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace='tokens')):
         token = self.vocab.get_token_from_index(index=index, namespace='tokens')
         archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
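The mapping pairs each index of the current vocabulary with the corresponding index in an archived vocabulary, silently skipping tokens the archived model never saw. A hedged sketch of the usual follow-up step, copying the matching embedding rows from the archived model (the function and argument names are assumptions):

from typing import List, Tuple
import torch

def copy_archived_embeddings(new_embedding: torch.nn.Embedding,
                             archived_embedding: torch.nn.Embedding,
                             vocab_index_mapping: List[Tuple[int, int]]) -> None:
    # Reuse the archived weight row for every matched token; tokens missing from
    # the archived vocabulary keep their freshly initialized vectors.
    with torch.no_grad():
        for new_index, archived_index in vocab_index_mapping:
            new_embedding.weight[new_index].copy_(
                archived_embedding.weight[archived_index])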
Example #28
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        calculate_span_f1: bool = None,
        label_encoding: Optional[str] = None,
        label_namespace: str = "labels",
        verbose_metrics: bool = False,
        initializer: InitializerApplicator = InitializerApplicator(),
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes))

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )

        # We keep calculate_span_f1 as a constructor argument for API consistency with
        # the CrfTagger, even though it is redundant in this class
        # (label_encoding serves the same purpose).
        if calculate_span_f1 and not label_encoding:
            raise ConfigurationError(
                "calculate_span_f1 is True, but no label_encoding was specified."
            )
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3),
        }

        if calculate_span_f1 or label_encoding:
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)
        else:
            self._f1_metric = None

        initializer(self)

        self.c_acc = CategoricalAccuracy()
        self.c_idx = vocab.get_token_index("c", namespace="labels")
Example #29
def build_vocab_fixed_labels(labels: list, instances: Iterable[Instance]) -> Vocabulary:
    logger.critical("Building the vocabulary")
    logger.critical("Initializing the labels namespace")
    vocab = Vocabulary()
    indexes = vocab.add_tokens_to_namespace(labels, namespace="labels")
    logger.critical(f"Mapped them\n{labels}\n{indexes}")
    logger.critical("Initializing the regular namespace")
    vocab.extend_from_instances(instances)

    second_indexes = [vocab.get_token_index(token, namespace="labels") for token in labels]
    # indexes = vocab.add_tokens_to_namespace(labels, namespace="labels")
    logger.critical(f"Mapped them\n{labels}\n{second_indexes}")
    return vocab
Example #30
 def tokens_to_indices(
         self, tokens: List[data.Token],
         vocabulary: data.Vocabulary) -> data.IndexedTokenList:
     indices: List[List[int]] = []
     vocab_size = vocabulary.get_vocab_size(self.namespace)
     for token in tokens:
         token_indices = []
         feats = self._feat_values(token)
         for feat in feats:
             token_indices.append(
                 vocabulary.get_token_index(feat, self.namespace))
         indices.append(
             util.pad_sequence_to_length(token_indices, vocab_size))
     return {"tokens": indices}
Example #31
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace(u"word")
     vocab.add_token_to_namespace(u"word2")
     embeddings_filename = unicode(self.TEST_DIR / u"embeddings.gz")
     with gzip.open(embeddings_filename, u'wb') as embeddings_file:
         embeddings_file.write(u"word 1.0 2.3 -1.0\n".encode(u'utf-8'))
     params = Params({
             u'pretrained_file': embeddings_filename,
             u'embedding_dim': 3,
             })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(u"word2")]
     assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
Example #32
    def __init__(
        self,
        vocabulary: Vocabulary,
        input_size: int = 256,
        hidden_size: int = 128,
        num_layers: int = 2,
        dropout: float = 0.0,
    ):
        super().__init__()
        self._start_index = vocabulary.get_token_index("@start@",
                                                       namespace="programs")
        self._end_index = vocabulary.get_token_index("@end@",
                                                     namespace="programs")
        self._pad_index = vocabulary.get_token_index("@@PADDING@@",
                                                     namespace="programs")
        self._unk_index = vocabulary.get_token_index("@@UNKNOWN@@",
                                                     namespace="programs")

        vocab_size = vocabulary.get_vocab_size(namespace="programs")
        embedder_inner = Embedding(vocab_size,
                                   input_size,
                                   padding_index=self._pad_index)
        self._embedder = BasicTextFieldEmbedder({"programs": embedder_inner})

        self._encoder = PytorchSeq2SeqWrapper(
            nn.LSTM(input_size,
                    hidden_size,
                    num_layers=num_layers,
                    dropout=dropout,
                    batch_first=True))
        # Project and tie input and output embeddings
        self._projection_layer = nn.Linear(hidden_size, input_size, bias=False)
        self._output_layer = nn.Linear(input_size, vocab_size, bias=False)
        self._output_layer.weight = embedder_inner.weight

        # Record average log2 (perplexity) for calculating final perplexity.
        self._log2_perplexity = Average()
Example #33
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 embedding_dropout: float,
                 encoder: Seq2SeqEncoder,
                 integrator: Seq2SeqEncoder,
                 integrator_dropout: float,
                 output_layer: Union[FeedForward, Maxout],
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)
        # We need the embeddings to convert word IDs to their vector representations
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout)
        self.text_field_embedder = text_field_embedder
        self.encoder = encoder

        self.integrator = integrator
        self.integrator_dropout = torch.nn.Dropout(integrator_dropout)

        self._self_attentive_pooling_projection = torch.nn.Linear(
            self.integrator.get_output_dim(), 1)
        self.output_layer = output_layer

        # Monitor the metrics - we use accuracy, as well as precision, recall and F1 for each label
        self.accuracy = CategoricalAccuracy()
        self.f1_measure_positive = F1Measure(
            vocab.get_token_index("positive", "labels"))
        self.f1_measure_negative = F1Measure(
            vocab.get_token_index("negative", "labels"))
        self.f1_measure_neutral = F1Measure(
            vocab.get_token_index("neutral", "labels"))

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #34
 def test_blank_pos_tag(self):
     tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
     for token in tokens:
         token.pos_ = ""
     indexer = PosTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spacy uses an empty string to indicate "no POS tag"
     # we convert it to "NONE"
     assert counter["pos_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index('NONE', 'pos_tokens')
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
     assert {"pos": [none_index, none_index, none_index, none_index]} == indices
Example #35
class KnowledgeGraphFieldTest(AllenNlpTestCase):
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        json = {
                'question': self.utterance,
                'columns': ['Name in English', 'Location in English'],
                'cells': [['Paradeniz', 'Mersin'],
                          ['Lake Gala', 'Edirne']]
                }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["tokens"] == {
                '-1': 1,
                '0': 1,
                '1': 1,
                'name': 1,
                'in': 2,
                'english': 2,
                'location': 1,
                'paradeniz': 1,
                'mersin': 1,
                'lake': 1,
                'gala': 1,
                'edirne': 1,
                }

    def test_index_converts_field_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field._indexed_entity_texts.keys() == {'tokens'}
        # Note that these are sorted by their _identifiers_, not their cell text, so the
        # `fb:row.rows` show up after the `fb:cells`.
        expected_array = [[self.negative_one_index],
                          [self.zero_index],
                          [self.one_index],
                          [self.edirne_index],
                          [self.lake_index, self.gala_index],
                          [self.mersin_index],
                          [self.paradeniz_index],
                          [self.location_index, self.in_index, self.english_index],
                          [self.name_index, self.in_index, self.english_index]]
        assert self.field._indexed_entity_texts['tokens'] == expected_array

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(AssertionError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9, 'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4}
        self.field._token_indexers['token_characters'] = TokenCharactersIndexer()
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9, 'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4,
                                                    'num_token_characters': 9}

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths['num_utterance_tokens'] += 1
        padding_lengths['num_entities'] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {'text', 'linking'}
        expected_text_tensor = [[self.negative_one_index, 0, 0],
                                [self.zero_index, 0, 0],
                                [self.one_index, 0, 0],
                                [self.edirne_index, 0, 0],
                                [self.lake_index, self.gala_index, 0],
                                [self.mersin_index, 0, 0],
                                [self.paradeniz_index, 0, 0],
                                [self.location_index, self.in_index, self.english_index],
                                [self.name_index, self.in_index, self.english_index],
                                [0, 0, 0]]
        assert_almost_equal(tensor_dict['text']['tokens'].detach().cpu().numpy(), expected_text_tensor)

        linking_tensor = tensor_dict['linking'].detach().cpu().numpy()
        expected_linking_tensor = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "mersin"
                                    [0, 0, 0, 0, 0, -1, 0, 0, 0, 0]],  # -1, "?"
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "mersin"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # 0, "?"
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "mersin"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # 1, "?"
                                   [[0, 0, 0, 0, 0, .2, 0, 0, 0, 0],  # fb:cell.edirne, "where"
                                    [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],  # fb:cell.edirne, "is"
                                    [0, 0, 0, 0, 0, .1666, 0, 0, 0, 0],  # fb:cell.edirne, "mersin"
                                    [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],  # fb:cell.edirne, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.edirne, padding
                                   [[0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],  # fb:cell.lake_gala, "where"
                                    [0, 0, 0, 0, 0, -3.5, 0, 0, 0, 0],  # fb:cell.lake_gala, "is"
                                    [0, 0, 0, 0, 0, -.3333, 0, 0, 0, 0],  # fb:cell.lake_gala, "mersin"
                                    [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],  # fb:cell.lake_gala, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.lake_gala, padding
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # fb:cell.mersin, "where"
                                    [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],  # fb:cell.mersin, "is"
                                    [0, 1, 1, 1, 1, 1, 0, 0, 1, 1],  # fb:cell.mersin, "mersin"
                                    [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],  # fb:cell.mersin, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.mersin, padding
                                   [[0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],  # fb:cell.paradeniz, "where"
                                    [0, 0, 0, 0, 0, -3, 0, 0, 0, 0],  # fb:cell.paradeniz, "is"
                                    [0, 0, 0, 0, 0, -.1666, 0, 0, 0, 0],  # fb:cell.paradeniz, "mersin"
                                    [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],  # fb:cell.paradeniz, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:cell.paradeniz, padding
                                   [[0, 0, 0, 0, 0, -2.6, 0, 0, 0, 0],  # fb:row.row.name_in_english, "where"
                                    [0, 0, 0, 0, 0, -7.5, 0, 0, 0, 0],  # fb:row.row.name_in_english, "is"
                                    [0, 0, 0, 0, 0, -1.8333, 1, 1, 0, 0],  # fb:row.row.name_in_english, "mersin"
                                    [0, 0, 0, 0, 0, -18, 0, 0, 0, 0],  # fb:row.row.name_in_english, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:row.row.name_in_english, padding
                                   [[0, 0, 0, 0, 0, -1.6, 0, 0, 0, 0],  # fb:row.row.location_in_english, "where"
                                    [0, 0, 0, 0, 0, -5.5, 0, 0, 0, 0],  # fb:row.row.location_in_english, "is"
                                    [0, 0, 0, 0, 0, -1, 0, 0, 0, 0],  # fb:row.row.location_in_english, "mersin"
                                    [0, 0, 0, 0, 0, -14, 0, 0, 0, 0],  # fb:row.row.location_in_english, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],  # fb:row.row.location_in_english, padding
                                   [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "where"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "is"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "mersin"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "?"
                                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]  # padding, padding
        for entity_index, entity_features in enumerate(expected_linking_tensor):
            for question_index, feature_vector in enumerate(entity_features):
                assert_almost_equal(linking_tensor[entity_index, question_index],
                                    feature_vector,
                                    decimal=4,
                                    err_msg=f"{entity_index} {question_index}")

    def test_lemma_feature_extractor(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("Names in English")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        lemma_feature = field._contains_lemma_match(entity,
                                                    field._entity_text_map[entity],
                                                    utterance[0],
                                                    0,
                                                    utterance)
        assert lemma_feature == 1

    def test_span_overlap_fraction(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("what is the name in english of mersin?")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        entity_text = field._entity_text_map[entity]
        feature_values = [field._span_overlap_fraction(entity, entity_text, token, i, utterance)
                          for i, token in enumerate(utterance)]
        assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0]

    def test_batch_tensors(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        tensor_dict1 = self.field.as_tensor(padding_lengths)
        tensor_dict2 = self.field.as_tensor(padding_lengths)
        batched_tensor_dict = self.field.batch_tensors([tensor_dict1, tensor_dict2])
        assert batched_tensor_dict.keys() == {'text', 'linking'}
        expected_single_tensor = [[self.negative_one_index, 0, 0],
                                  [self.zero_index, 0, 0],
                                  [self.one_index, 0, 0],
                                  [self.edirne_index, 0, 0],
                                  [self.lake_index, self.gala_index, 0],
                                  [self.mersin_index, 0, 0],
                                  [self.paradeniz_index, 0, 0],
                                  [self.location_index, self.in_index, self.english_index],
                                  [self.name_index, self.in_index, self.english_index]]
        expected_batched_tensor = [expected_single_tensor, expected_single_tensor]
        assert_almost_equal(batched_tensor_dict['text']['tokens'].detach().cpu().numpy(),
                            expected_batched_tensor)
        expected_linking_tensor = torch.stack([tensor_dict1['linking'], tensor_dict2['linking']])
        assert_almost_equal(batched_tensor_dict['linking'].detach().cpu().numpy(),
                            expected_linking_tensor.detach().cpu().numpy())