def test_build_vocab_mapping(self):
    vocab = Vocabulary({
        'target': {
            'aaa': 1,
            'bbb': 1,
            'ccc': 1,
            'ddd': 1,
            'eee': 1,
        },
        'tokens': {
            '111': 1,
            'aaa': 1,
            '222': 1,
            'bbb': 1,
            'ccc': 1,
            '333': 1,
            'ddd': 1,
            'eee': 1,
        }
    })
    mapping = CandidatesSelector._build_mapping(vocab, 'target', 'tokens')
    print(mapping)
    self.assertEqual(mapping[vocab.get_token_index('ccc', 'target')],
                     vocab.get_token_index('ccc', 'tokens'))
    self.assertNotEqual(vocab.get_token_index('ccc', 'target'),
                        vocab.get_token_index('ccc', 'tokens'))
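# A minimal sketch of what CandidatesSelector._build_mapping presumably does, since its
# implementation is not shown above: map every index in the `target` namespace to the
# index of the same token string in the `tokens` namespace. The body below is an
# assumption made for illustration, not the original implementation.
@staticmethod
def _build_mapping(vocab: Vocabulary, from_namespace: str, to_namespace: str) -> Dict[int, int]:
    mapping = {}
    for token, index in vocab.get_token_to_index_vocabulary(from_namespace).items():
        # get_token_index falls back to the OOV index when the token is missing
        # from the destination namespace.
        mapping[index] = vocab.get_token_index(token, to_namespace)
    return mapping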
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    # We need the embeddings to convert word IDs to their vector representations
    self.word_embeddings = word_embeddings
    self.encoder = encoder

    # After converting a sequence of vectors to a single vector, we feed it into
    # a fully-connected linear layer to reduce the dimension to the total number of labels.
    self.linear = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels'))

    # Monitor the metrics - overall accuracy, plus precision, recall, and F1 per label
    self.accuracy = CategoricalAccuracy()
    self.f1_measure_positive = F1Measure(
        vocab.get_token_index("positive", "labels"))
    self.f1_measure_negative = F1Measure(
        vocab.get_token_index("negative", "labels"))
    self.f1_measure_neutral = F1Measure(
        vocab.get_token_index("neutral", "labels"))

    # We use the cross entropy loss because this is a classification task.
    # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
    # which makes it unnecessary to add a separate softmax layer.
    self.loss_function = torch.nn.CrossEntropyLoss()
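# A possible get_metrics companion for the constructor above, added here only as an
# illustrative sketch; it is not part of the original snippet. It assumes an AllenNLP
# version in which F1Measure.get_metric(reset) returns a (precision, recall, f1) tuple,
# and the metric names are made up.
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    _, _, f1_positive = self.f1_measure_positive.get_metric(reset)
    _, _, f1_negative = self.f1_measure_negative.get_metric(reset)
    _, _, f1_neutral = self.f1_measure_neutral.get_metric(reset)
    return {
        "accuracy": self.accuracy.get_metric(reset),
        "f1_positive": f1_positive,
        "f1_negative": f1_negative,
        "f1_neutral": f1_neutral,
    }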
def test_embedding_constructed_directly_with_pretrained_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    num_embeddings = vocab.get_vocab_size()
    embedding_layer = Embedding(
        embedding_dim=3,
        num_embeddings=num_embeddings,
        pretrained_file=embeddings_filename,
        vocab=vocab,
    )
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))
    params = Params({
        "pretrained_file": embeddings_filename,
        "embedding_dim": 3
    })
    embedding_layer = Embedding.from_params(params, vocab=vocab)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
def index(self, vocab: Vocabulary):
    vocab_size = vocab.get_vocab_size()
    ids = {}
    for token in self._source_tokens:
        text = token.text.lower()
        text_ids = vocab.get_token_index(text)
        if text_ids == vocab.get_token_index(DEFAULT_OOV_TOKEN):
            self._out.append(ids.setdefault(text, len(ids) + vocab_size))
        else:
            self._out.append(text_ids)
def __init__(self,
             vocab: Vocabulary,
             recurrent_dropout_probability: float = 0.0,
             embedding_dropout_probability: float = 0.0,
             input_size=512,
             hidden_size=512) -> None:
    """
    :param vocab: vocabulary used to size the decoder and look up special token indices
    :param recurrent_dropout_probability: recurrent dropout to add to the LSTM layers
    :param embedding_dropout_probability: dropout applied to the input embeddings
    :param input_size: input dimension of the stacked LSTMs
    :param hidden_size: hidden dimension of the stacked LSTMs
    """
    super(SimpleBiLM, self).__init__()
    self.forward_lm = PytorchSeq2SeqWrapper(StackedLstm(
        input_size=input_size, hidden_size=hidden_size, num_layers=2,
        go_forward=True,
        recurrent_dropout_probability=recurrent_dropout_probability,
        use_input_projection_bias=False, use_highway=True), stateful=True)
    self.reverse_lm = PytorchSeq2SeqWrapper(StackedLstm(
        input_size=input_size, hidden_size=hidden_size, num_layers=2,
        go_forward=False,
        recurrent_dropout_probability=recurrent_dropout_probability,
        use_input_projection_bias=False, use_highway=True), stateful=True)

    # This will also be the encoder
    self.decoder = torch.nn.Linear(512, vocab.get_vocab_size(namespace='tokens'))
    self.vocab = vocab
    self.register_buffer(
        'eos_tokens',
        torch.LongTensor([
            vocab.get_token_index(tok) for tok in [
                '.', '!', '?', '@@UNKNOWN@@', '@@PADDING@@', '@@bos@@', '@@eos@@'
            ]
        ]))
    self.register_buffer(
        'invalid_tokens',
        torch.LongTensor([
            vocab.get_token_index(tok) for tok in [
                '@@UNKNOWN@@', '@@PADDING@@', '@@bos@@', '@@eos@@', '@@NEWLINE@@'
            ]
        ]))
    self.embedding_dropout_probability = embedding_dropout_probability
def generate_ids_out(vocab: Vocabulary, source_tokens: List[Token]):
    vocab_size = vocab.get_vocab_size()
    ids = {}
    out = []
    for token in source_tokens:
        text = token.text.lower()
        text_ids = vocab.get_token_index(text)
        if text_ids == vocab.get_token_index(DEFAULT_OOV_TOKEN):
            # Out-of-vocabulary source tokens get temporary indices that start right
            # after the regular vocabulary, so a copy mechanism can still point at them.
            out.append(ids.setdefault(text, len(ids) + vocab_size))
        else:
            out.append(text_ids)
    return ids, out
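# Small usage sketch (not part of the original code): with a vocabulary that only knows
# "the" and "cat", the out-of-vocabulary token "zorp" is assigned a temporary id equal to
# vocab.get_vocab_size(). Token texts are invented, and the default @@PADDING@@/@@UNKNOWN@@
# tokens are assumed to be present in the vocabulary.
vocab = Vocabulary()
vocab.add_token_to_namespace("the")
vocab.add_token_to_namespace("cat")
ids, out = CopyField.generate_ids_out(vocab, [Token("The"), Token("zorp"), Token("cat")])
assert ids == {"zorp": vocab.get_vocab_size()}
assert len(out) == 3  # one index per source token, OOV tokens included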
def __init__(
    self,
    vocabulary: Vocabulary,
    image_feature_size: int,
    embedding_size: int,
    hidden_size: int,
    attention_projection_size: int,
    constraint,
    max_caption_length: int = 20,
    beam_size: int = 1,
) -> None:
    super().__init__()
    self._vocabulary = vocabulary
    self.image_feature_size = image_feature_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.attention_projection_size = attention_projection_size

    # Short hand variable names for convenience
    self.vocab_size = vocabulary.get_vocab_size()
    self._pad_index = vocabulary.get_token_index("@@UNKNOWN@@")
    self._boundary_index = vocabulary.get_token_index("@@BOUNDARY@@")

    self._embedding_layer = nn.Embedding(self.vocab_size,
                                         embedding_size,
                                         padding_idx=self._pad_index)
    self._updown_cell = UpDownCell(image_feature_size, embedding_size,
                                   hidden_size, attention_projection_size)

    self.to_glove = nn.Linear(hidden_size, self.embedding_size)
    self._output_layer = nn.Linear(self.embedding_size, self.vocab_size, bias=False)
    self._log_softmax = nn.LogSoftmax(dim=1)

    # We use beam search to find the most likely caption during inference.
    self._beam_size = beam_size
    self._beam_search = ConstraintBeamSearch(
        self._boundary_index,
        max_steps=max_caption_length,
        beam_size=beam_size,
        per_node_beam_size=beam_size // 2,
    )
    self._fc = constraint
    self._beam_search.update_parameter(self._fc.select_state_func)
    self._max_caption_length = max_caption_length

    self._initialize_glove()
def test_read_embedding_file_inside_archive(self):
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
        'embedding_dim': 5
    })
    with pytest.raises(
            ValueError,
            match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, "
                  "so you must select one of the files inside "
                  "providing a uri of the type: "
                  "\\(path_or_url_to_archive\\)#path_inside_archive\\."):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
            'pretrained_file': file_uri,
            'embedding_dim': 5
        })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 3,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
def __init__(self,
             vocab: Vocabulary,
             mention_feedforward: FeedForward,
             feature_size: int,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(NERTagger_Has_None, self).__init__(vocab, regularizer)

    # The number of classes determines the output dimension of the final layer.
    self._n_labels = vocab.get_vocab_size('ner_labels')

    # TODO(dwadden) think of a better way to enforce this.
    # The metrics need to know which index is the null label.
    null_label = vocab.get_token_index("", "ner_labels")
    assert null_label == 0  # If not, the dummy class won't correspond to the null label.

    self._ner_scorer = torch.nn.Sequential(
        TimeDistributed(mention_feedforward),
        TimeDistributed(
            torch.nn.Linear(mention_feedforward.get_output_dim(), self._n_labels)))

    self._ner_metrics = NERMetrics(self._n_labels, null_label)

    self._loss = torch.nn.CrossEntropyLoss(reduction="sum")

    initializer(self)
def test_read_embedding_file_inside_archive(self):
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
        'embedding_dim': 5
    })
    with pytest.raises(ValueError,
                       message="No ValueError when pretrained_file is a multi-file archive"):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
            'pretrained_file': file_uri,
            'embedding_dim': 5
        })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def __init__(
    self,
    vocabulary: Vocabulary,
    image_features_h5path: str,
    boxes_jsonpath: str,
    wordforms_tsvpath: str,
    hierarchy_jsonpath: str,
    nms_threshold: float = 0.85,
    max_given_constraints: int = 3,
    max_words_per_constraint: int = 3,
    in_memory: bool = True,
):
    super().__init__(image_features_h5path, in_memory=in_memory)
    self._vocabulary = vocabulary
    self._pad_index = vocabulary.get_token_index("@@UNKNOWN@@")

    self._boxes_reader = ConstraintBoxesReader(boxes_jsonpath)
    self._constraint_filter = ConstraintFilter(
        hierarchy_jsonpath, nms_threshold, max_given_constraints)
    self._fsm_builder = FiniteStateMachineBuilder(
        vocabulary, wordforms_tsvpath, max_given_constraints, max_words_per_constraint)
class TestBagOfWordCountsTokenEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBagOfWordCountsTokenEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")

    def test_forward_calculates_bow_properly(self):
        params = Params({})
        embedder = BagOfWordCountsTokenEmbedder.from_params(self.vocab, params=params)
        numpy_tensor = np.array([[2, 0], [3, 0], [4, 4]])
        inputs = torch.from_numpy(numpy_tensor).unsqueeze(1)
        embedder_output = embedder(inputs)
        numpy_tensor = np.array([[1, 0, 1, 0, 0, 0],
                                 [1, 0, 0, 1, 0, 0],
                                 [0, 0, 0, 0, 2, 0]])
        manual_output = torch.from_numpy(numpy_tensor).float()
        assert_almost_equal(embedder_output.data.numpy(), manual_output.data.numpy())

    def test_projects_properly(self):
        params = Params({"projection_dim": 50})
        embedder = BagOfWordCountsTokenEmbedder.from_params(self.vocab, params=params)
        numpy_tensor = np.array([self.vocab.get_token_index(x) for x in ["1", "2", "3"]])
        inputs = torch.from_numpy(numpy_tensor).unsqueeze(1)
        embedder_output = embedder(inputs)
        assert embedder_output.shape[1] == 50
def test_read_embedding_file_inside_archive(self):
    token2vec = {
        u"think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        u"make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        u"difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        u"àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        u'pretrained_file': unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive.zip'),
        u'embedding_dim': 5
    })
    with pytest.raises(ValueError,
                       message=u"No ValueError when pretrained_file is a multi-file archive"):
        Embedding.from_params(vocab, params)

    for ext in [u'.zip', u'.tar.gz']:
        archive_path = unicode(self.FIXTURES_ROOT / u'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, u'folder/fake_embeddings.5d.txt')
        params = Params({
            u'pretrained_file': file_uri,
            u'embedding_dim': 5
        })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in list(token2vec.items()):
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), u'Problem with format ' + archive_path
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             seq2vec_encoder: Seq2VecEncoder,
             seq2seq_encoder: Optional[Seq2SeqEncoder] = None,
             feedforward: Optional[FeedForward] = None,
             dropout: float = 0.0,
             do_batch_norm: bool = False) -> None:
    super(QuizGuesser, self).__init__(vocab)
    self.text_field_embedder = text_field_embedder
    self.seq2seq_encoder = seq2seq_encoder
    self.seq2vec_encoder = seq2vec_encoder
    self.feedforward = feedforward

    if self.feedforward is not None:
        entity_embedding_dim = self.feedforward.get_output_dim()
    else:
        entity_embedding_dim = self.seq2vec_encoder.get_output_dim()

    num_entities = vocab.get_vocab_size('entities')
    self.entity_embedder = Embedding(
        entity_embedding_dim,
        num_entities,
        vocab_namespace='entities',
        padding_index=vocab.get_token_index(vocab._oov_token, namespace='entities'))

    self.dropout = nn.Dropout(dropout)
    if do_batch_norm:
        self.batch_norm = nn.BatchNorm1d(num_entities)

    self.accuracy = CategoricalAccuracy(top_k=1)
    self.mean_reciprocal_rank = MeanReciprocalRank()
def __init__(self,
             vocab: Vocabulary,
             beam_size: int,
             namespace: str = 'tokens',
             end_symbol: str = None,
             min_steps: int = None,
             max_steps: int = 50,
             per_node_beam_size: int = None,
             disallow_repeated_ngrams: int = None,
             repeated_ngrams_exceptions: List[str] = None,
             length_penalizer: LengthPenalizer = None,
             coverage_penalizer: CoveragePenalizer = None) -> None:
    self.beam_size = beam_size
    end_symbol = end_symbol or END_SYMBOL
    self._end_index = vocab.get_token_index(end_symbol, namespace)
    self.max_steps = max_steps
    self.min_steps = min_steps
    self.per_node_beam_size = per_node_beam_size or beam_size
    self.length_penalizer = length_penalizer
    self.coverage_penalizer = coverage_penalizer

    # Convert the token exceptions to their indexes
    self.disallow_repeated_ngrams = disallow_repeated_ngrams
    self.repeated_ngrams_exceptions = set()
    repeated_ngrams_exceptions = repeated_ngrams_exceptions or []
    token_to_index = vocab.get_token_to_index_vocabulary(namespace)
    for token in repeated_ngrams_exceptions:
        if token not in token_to_index:
            raise Exception(f'Could not add token exception {token} because {token} is not in the vocabulary')
        self.repeated_ngrams_exceptions.add(token_to_index[token])
def __init__(self, vocab: Vocabulary, class_labels: List[str]) -> None:
    self.class_labels = class_labels
    positive_labels = []
    for class_label in class_labels:
        positive_labels.append(vocab.get_token_index(class_label, namespace='labels'))
    self._pos_labels = set(positive_labels)
    super(TagF1, self).__init__(positive_label=1)
def index(self, vocab: Vocabulary):
    source_ids, _ = CopyField.generate_ids_out(vocab, self._source_tokens)
    for token in self._target_tokens:
        text = token.text.lower()
        if text in source_ids:
            self._out.append(source_ids[text])
        else:
            self._out.append(vocab.get_token_index(text))
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {
        "token_ids": [10, 15] +
                     [vocabulary.get_token_index(token.text, 'words') for token in tokens] +
                     [25],
        "additional_key": [22, 29]
    }
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             pooler: Seq2VecEncoder,
             *,
             logits_threshold: float = 0,
             softmax: bool = False,
             gamma: float = 0,
             beta: float = 0,
             initializer: InitializerApplicator = InitializerApplicator(),
             report_confusion_matrix: bool = False,
             report_samplewise: bool = False,
             **kwargs) -> None:
    super().__init__(vocab,
                     text_field_embedder=text_field_embedder,
                     encoder=encoder,
                     **kwargs)
    self._pooler = pooler
    self._logits_threshold = logits_threshold
    self._softmax = softmax

    class_statistics = torch.ones(self._num_trigger_classes)
    vocab_counter: Dict[str, Dict[str, int]] = getattr(vocab, "_retained_counter", {})
    label_counter = vocab_counter.get(self._trigger_label_namespace)
    if label_counter is not None:
        # The retained counter stores how many times each label was seen during
        # vocabulary construction; use those counts as the class statistics.
        for label, count in label_counter.items():
            idx = vocab.get_token_index(label, namespace=self._trigger_label_namespace)
            class_statistics[idx] = count
        logger.info(f'Class statistics: {class_statistics}')
    else:
        logger.info('The vocab counter is not retained.')

    if softmax:
        self._loss = ClassBalancedFocalLoss(CrossEntropyLoss,
                                            gamma=gamma,
                                            beta=beta,
                                            class_statistics=class_statistics)
    else:
        self._loss = ClassBalancedFocalLoss(BCEWithLogitsLoss,
                                            gamma=gamma,
                                            beta=beta,
                                            class_statistics=class_statistics[..., 1:])

    self._report_confusion_matrix = report_confusion_matrix
    self._report_samplewise = report_samplewise

    initializer(self)
    self.metrics: Dict[str, Dict[str, Metric]] = defaultdict(self._init_metrics)
def tokens_to_indices(self, tokens: List[Token],
                      vocabulary: Vocabulary) -> Dict[str, List[int]]:
    return {
        "token_ids": ([10, 15]
                      + [vocabulary.get_token_index(token.text, "words") for token in tokens]
                      + [25]),
        "additional_key": [22, 29],
    }
def __init__(self, vocab: Vocabulary, class_labels: List[str]) -> None:
    self.class_labels = class_labels
    positive_labels = {}
    for class_label in class_labels:
        cls_index = vocab.get_token_index(class_label, namespace='labels')
        positive_labels[class_label] = (cls_index, F1Measure(positive_label=cls_index))
    self._positive_labels = positive_labels
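# Sketch of how the per-class F1 measures built above might be updated and read back,
# added for illustration only; the original class does not show these methods. The
# AllenNLP Metric API names (__call__, get_metric) are standard, but this assumes a
# version in which F1Measure.get_metric(reset) returns a (precision, recall, f1) tuple.
def __call__(self, predictions, gold_labels, mask=None):
    for _, (_, f1_measure) in self._positive_labels.items():
        f1_measure(predictions, gold_labels, mask)

def get_metric(self, reset: bool = False) -> Dict[str, float]:
    metrics = {}
    for class_label, (_, f1_measure) in self._positive_labels.items():
        _, _, f1 = f1_measure.get_metric(reset)
        metrics[f'{class_label}_f1'] = f1
    return metrics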
def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
    vocab_index_mapping: List[Tuple[int, int]] = []
    for index in range(self.vocab.get_vocab_size(namespace='tokens')):
        token = self.vocab.get_token_from_index(index=index, namespace='tokens')
        archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
        # Checking if we got the UNK token index, because we don't want all new token
        # representations initialized to UNK token's representation. We do that by checking if
        # the two tokens are the same. They will not be if the token at the archived index is
        # UNK.
        if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
            vocab_index_mapping.append((index, archived_token_index))
    return vocab_index_mapping
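# One way the (new_index, archived_index) pairs returned above could be used, sketched
# for illustration: copying pre-trained rows from an archived model's embedding matrix
# into a freshly initialized one. The attribute names `archived_embedder` and
# `new_embedder` are assumptions, not part of the original code.
def _copy_archived_embeddings(self, archived_embedder, new_embedder,
                              vocab_index_mapping: List[Tuple[int, int]]) -> None:
    with torch.no_grad():
        for new_index, archived_index in vocab_index_mapping:
            new_embedder.weight[new_index].copy_(archived_embedder.weight[archived_index])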
def __init__(self,
             vocab: Vocabulary,
             beam_size: int,
             namespace: str = 'tokens',
             end_symbol: str = None,
             max_steps: int = 500,
             per_node_beam_size: int = None) -> None:
    self.beam_size = beam_size
    end_symbol = end_symbol or END_SYMBOL
    self._end_index = vocab.get_token_index(end_symbol, namespace)
    self.max_steps = max_steps
    self.per_node_beam_size = per_node_beam_size or beam_size
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    calculate_span_f1: bool = None,
    label_encoding: Optional[str] = None,
    label_namespace: str = "labels",
    verbose_metrics: bool = False,
    initializer: InitializerApplicator = InitializerApplicator(),
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)
    self.label_namespace = label_namespace
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size(label_namespace)
    self.encoder = encoder
    self._verbose_metrics = verbose_metrics
    self.tag_projection_layer = TimeDistributed(
        Linear(self.encoder.get_output_dim(), self.num_classes))

    check_dimensions_match(
        text_field_embedder.get_output_dim(),
        encoder.get_input_dim(),
        "text field embedding dim",
        "encoder input dim",
    )

    # We keep calculate_span_f1 as a constructor argument for API consistency with
    # the CrfTagger, even though it is redundant in this class
    # (label_encoding serves the same purpose).
    if calculate_span_f1 and not label_encoding:
        raise ConfigurationError(
            "calculate_span_f1 is True, but no label_encoding was specified.")

    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "accuracy3": CategoricalAccuracy(top_k=3),
    }
    if calculate_span_f1 or label_encoding:
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace=label_namespace,
                                             label_encoding=label_encoding)
    else:
        self._f1_metric = None

    initializer(self)

    self.c_acc = CategoricalAccuracy()
    self.c_idx = vocab.get_token_index("c", namespace="labels")
def build_vocab_fixed_labels(labels: list, instances: Iterable[Instance]) -> Vocabulary:
    logger.critical("Building the vocabulary")
    logger.critical("Initializing the labels namespace")
    vocab = Vocabulary()
    indexes = vocab.add_tokens_to_namespace(labels, namespace="labels")
    logger.critical(f"Mapped them\n{labels}\n{indexes}")

    logger.critical("Initializing the regular namespace")
    vocab.extend_from_instances(instances)
    second_indexes = [vocab.get_token_index(token, namespace="labels") for token in labels]
    # indexes = vocab.add_tokens_to_namespace(labels, namespace="labels")
    logger.critical(f"Mapped them\n{labels}\n{second_indexes}")

    return vocab
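# Usage sketch (illustrative only): adding the labels before reading any instances pins
# their indices, and re-querying after extend_from_instances confirms the ids did not
# shift. `my_training_instances` is a placeholder, and the 0/1/2 assertion assumes the
# default non-padded "labels" namespace.
labels = ["O", "B-ENT", "I-ENT"]
vocab = build_vocab_fixed_labels(labels, instances=my_training_instances)
assert [vocab.get_token_index(label, namespace="labels") for label in labels] == [0, 1, 2]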
def tokens_to_indices(self,
                      tokens: List[data.Token],
                      vocabulary: data.Vocabulary) -> data.IndexedTokenList:
    indices: List[List[int]] = []
    vocab_size = vocabulary.get_vocab_size(self.namespace)
    for token in tokens:
        token_indices = []
        feats = self._feat_values(token)
        for feat in feats:
            token_indices.append(vocabulary.get_token_index(feat, self.namespace))
        # Every token's list of feature ids is padded to the size of the feature
        # vocabulary, so all tokens produce equal-length index lists.
        indices.append(util.pad_sequence_to_length(token_indices, vocab_size))
    return {"tokens": indices}
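# A small illustration (not from the original code) of why every token's feature-id list
# is padded: with a feature vocabulary of size 6 and two feature ids, the padded list
# always has length 6, so the per-token lists can be stacked into one tensor.
padded = util.pad_sequence_to_length([3, 5], 6)
assert padded == [3, 5, 0, 0, 0, 0]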
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace(u"word")
    vocab.add_token_to_namespace(u"word2")
    embeddings_filename = unicode(self.TEST_DIR / u"embeddings.gz")
    with gzip.open(embeddings_filename, u'wb') as embeddings_file:
        embeddings_file.write(u"word 1.0 2.3 -1.0\n".encode(u'utf-8'))
    params = Params({
        u'pretrained_file': embeddings_filename,
        u'embedding_dim': 3,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    word_vector = embedding_layer.weight.data[vocab.get_token_index(u"word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
def __init__(
    self,
    vocabulary: Vocabulary,
    input_size: int = 256,
    hidden_size: int = 128,
    num_layers: int = 2,
    dropout: float = 0.0,
):
    super().__init__()
    self._start_index = vocabulary.get_token_index("@start@", namespace="programs")
    self._end_index = vocabulary.get_token_index("@end@", namespace="programs")
    self._pad_index = vocabulary.get_token_index("@@PADDING@@", namespace="programs")
    self._unk_index = vocabulary.get_token_index("@@UNKNOWN@@", namespace="programs")

    vocab_size = vocabulary.get_vocab_size(namespace="programs")
    embedder_inner = Embedding(vocab_size, input_size, padding_index=self._pad_index)
    self._embedder = BasicTextFieldEmbedder({"programs": embedder_inner})
    self._encoder = PytorchSeq2SeqWrapper(
        nn.LSTM(input_size, hidden_size,
                num_layers=num_layers, dropout=dropout, batch_first=True))

    # Project and tie input and output embeddings
    self._projection_layer = nn.Linear(hidden_size, input_size, bias=False)
    self._output_layer = nn.Linear(input_size, vocab_size, bias=False)
    self._output_layer.weight = embedder_inner.weight

    # Record average log2 (perplexity) for calculating final perplexity.
    self._log2_perplexity = Average()
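# Sketch of how the running average recorded above could be reported as a perplexity;
# the original get_metrics is not shown, so this method is an assumption for illustration.
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    # Perplexity is 2 raised to the average per-token log2 loss.
    return {"perplexity": 2 ** self._log2_perplexity.get_metric(reset)}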
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             embedding_dropout: float,
             encoder: Seq2SeqEncoder,
             integrator: Seq2SeqEncoder,
             integrator_dropout: float,
             output_layer: Union[FeedForward, Maxout],
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    # We need the embeddings to convert word IDs to their vector representations
    self.embedding_dropout = torch.nn.Dropout(embedding_dropout)
    self.text_field_embedder = text_field_embedder
    self.encoder = encoder
    self.integrator = integrator
    self.integrator_dropout = torch.nn.Dropout(integrator_dropout)
    self._self_attentive_pooling_projection = torch.nn.Linear(
        self.integrator.get_output_dim(), 1)
    self.output_layer = output_layer

    # Monitor the metrics - overall accuracy, plus precision, recall, and F1 per label
    self.accuracy = CategoricalAccuracy()
    self.f1_measure_positive = F1Measure(
        vocab.get_token_index("positive", "labels"))
    self.f1_measure_negative = F1Measure(
        vocab.get_token_index("negative", "labels"))
    self.f1_measure_neutral = F1Measure(
        vocab.get_token_index("neutral", "labels"))

    # We use the cross entropy loss because this is a classification task.
    # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
    # which makes it unnecessary to add a separate softmax layer.
    self.loss_function = torch.nn.CrossEntropyLoss()
    initializer(self)
def test_blank_pos_tag(self):
    tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
    for token in tokens:
        token.pos_ = ""
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    # spaCy uses an empty string to indicate "no POS tag"; we convert it to "NONE".
    assert counter["pos_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index('NONE', 'pos_tokens')
    # This should raise no exception.
    indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
    assert {"pos": [none_index, none_index, none_index, none_index]} == indices
class KnowledgeGraphFieldTest(AllenNlpTestCase):
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        json = {
            'question': self.utterance,
            'columns': ['Name in English', 'Location in English'],
            'cells': [['Paradeniz', 'Mersin'],
                      ['Lake Gala', 'Edirne']]
        }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace("paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace("-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers,
                                         self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["tokens"] == {
                '-1': 1,
                '0': 1,
                '1': 1,
                'name': 1,
                'in': 2,
                'english': 2,
                'location': 1,
                'paradeniz': 1,
                'mersin': 1,
                'lake': 1,
                'gala': 1,
                'edirne': 1,
                }

    def test_index_converts_field_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field._indexed_entity_texts.keys() == {'tokens'}
        # Note that these are sorted by their _identifiers_, not their cell text, so the
        # `fb:row.rows` show up after the `fb:cells`.
        expected_array = [[self.negative_one_index],
                          [self.zero_index],
                          [self.one_index],
                          [self.edirne_index],
                          [self.lake_index, self.gala_index],
                          [self.mersin_index],
                          [self.paradeniz_index],
                          [self.location_index, self.in_index, self.english_index],
                          [self.name_index, self.in_index, self.english_index]]
        assert self.field._indexed_entity_texts['tokens'] == expected_array

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(AssertionError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9,
                                                    'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4}
        self.field._token_indexers['token_characters'] = TokenCharactersIndexer()
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 9,
                                                    'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4,
                                                    'num_token_characters': 9}

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths['num_utterance_tokens'] += 1
        padding_lengths['num_entities'] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {'text', 'linking'}
        expected_text_tensor = [[self.negative_one_index, 0, 0],
                                [self.zero_index, 0, 0],
                                [self.one_index, 0, 0],
                                [self.edirne_index, 0, 0],
                                [self.lake_index, self.gala_index, 0],
                                [self.mersin_index, 0, 0],
                                [self.paradeniz_index, 0, 0],
                                [self.location_index, self.in_index, self.english_index],
                                [self.name_index, self.in_index, self.english_index],
                                [0, 0, 0]]
        assert_almost_equal(tensor_dict['text']['tokens'].detach().cpu().numpy(),
                            expected_text_tensor)

        linking_tensor = tensor_dict['linking'].detach().cpu().numpy()
        expected_linking_tensor = [
                [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # -1, "where"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # -1, "is"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # -1, "mersin"
                 [0, 0, 0, 0, 0, -1, 0, 0, 0, 0]],    # -1, "?"
                [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # 0, "where"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # 0, "is"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # 0, "mersin"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # 0, "?"
                [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # 1, "where"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # 1, "is"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # 1, "mersin"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # 1, "?"
                [[0, 0, 0, 0, 0, .2, 0, 0, 0, 0],     # fb:cell.edirne, "where"
                 [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],   # fb:cell.edirne, "is"
                 [0, 0, 0, 0, 0, .1666, 0, 0, 0, 0],  # fb:cell.edirne, "mersin"
                 [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],     # fb:cell.edirne, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # fb:cell.edirne, padding
                [[0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],    # fb:cell.lake_gala, "where"
                 [0, 0, 0, 0, 0, -3.5, 0, 0, 0, 0],   # fb:cell.lake_gala, "is"
                 [0, 0, 0, 0, 0, -.3333, 0, 0, 0, 0], # fb:cell.lake_gala, "mersin"
                 [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],     # fb:cell.lake_gala, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # fb:cell.lake_gala, padding
                [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # fb:cell.mersin, "where"
                 [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],   # fb:cell.mersin, "is"
                 [0, 1, 1, 1, 1, 1, 0, 0, 1, 1],      # fb:cell.mersin, "mersin"
                 [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],     # fb:cell.mersin, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # fb:cell.mersin, padding
                [[0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],    # fb:cell.paradeniz, "where"
                 [0, 0, 0, 0, 0, -3, 0, 0, 0, 0],     # fb:cell.paradeniz, "is"
                 [0, 0, 0, 0, 0, -.1666, 0, 0, 0, 0], # fb:cell.paradeniz, "mersin"
                 [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],     # fb:cell.paradeniz, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # fb:cell.paradeniz, padding
                [[0, 0, 0, 0, 0, -2.6, 0, 0, 0, 0],   # fb:row.row.name_in_english, "where"
                 [0, 0, 0, 0, 0, -7.5, 0, 0, 0, 0],   # fb:row.row.name_in_english, "is"
                 [0, 0, 0, 0, 0, -1.8333, 1, 1, 0, 0],  # fb:row.row.name_in_english, "mersin"
                 [0, 0, 0, 0, 0, -18, 0, 0, 0, 0],    # fb:row.row.name_in_english, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # fb:row.row.name_in_english, padding
                [[0, 0, 0, 0, 0, -1.6, 0, 0, 0, 0],   # fb:row.row.location_in_english, "where"
                 [0, 0, 0, 0, 0, -5.5, 0, 0, 0, 0],   # fb:row.row.location_in_english, "is"
                 [0, 0, 0, 0, 0, -1, 0, 0, 0, 0],     # fb:row.row.location_in_english, "mersin"
                 [0, 0, 0, 0, 0, -14, 0, 0, 0, 0],    # fb:row.row.location_in_english, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],     # fb:row.row.location_in_english, padding
                [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # padding, "where"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # padding, "is"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # padding, "mersin"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],      # padding, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]     # padding, padding
        for entity_index, entity_features in enumerate(expected_linking_tensor):
            for question_index, feature_vector in enumerate(entity_features):
                assert_almost_equal(linking_tensor[entity_index, question_index],
                                    feature_vector,
                                    decimal=4,
                                    err_msg=f"{entity_index} {question_index}")

    def test_lemma_feature_extractor(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("Names in English")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers,
                                    self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        lemma_feature = field._contains_lemma_match(entity,
                                                    field._entity_text_map[entity],
                                                    utterance[0],
                                                    0,
                                                    utterance)
        assert lemma_feature == 1

    def test_span_overlap_fraction(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("what is the name in english of mersin?")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers,
                                    self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        entity_text = field._entity_text_map[entity]
        feature_values = [field._span_overlap_fraction(entity, entity_text, token, i, utterance)
                          for i, token in enumerate(utterance)]
        assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0]

    def test_batch_tensors(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        tensor_dict1 = self.field.as_tensor(padding_lengths)
        tensor_dict2 = self.field.as_tensor(padding_lengths)
        batched_tensor_dict = self.field.batch_tensors([tensor_dict1, tensor_dict2])
        assert batched_tensor_dict.keys() == {'text', 'linking'}
        expected_single_tensor = [[self.negative_one_index, 0, 0],
                                  [self.zero_index, 0, 0],
                                  [self.one_index, 0, 0],
                                  [self.edirne_index, 0, 0],
                                  [self.lake_index, self.gala_index, 0],
                                  [self.mersin_index, 0, 0],
                                  [self.paradeniz_index, 0, 0],
                                  [self.location_index, self.in_index, self.english_index],
                                  [self.name_index, self.in_index, self.english_index]]
        expected_batched_tensor = [expected_single_tensor, expected_single_tensor]
        assert_almost_equal(batched_tensor_dict['text']['tokens'].detach().cpu().numpy(),
                            expected_batched_tensor)
        expected_linking_tensor = torch.stack([tensor_dict1['linking'], tensor_dict2['linking']])
        assert_almost_equal(batched_tensor_dict['linking'].detach().cpu().numpy(),
                            expected_linking_tensor.detach().cpu().numpy())