Example #1
    def test_crashes_with_empty_feature_value_and_no_default(self):
        tokenizer = SpacyTokenizer(parse=True)
        tokens = tokenizer.tokenize("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
        vocab.add_token_to_namespace("NONE", namespace="dep_labels")
        indexer = SingleIdTokenIndexer(namespace="dep_labels", feature_name="dep_")
        with pytest.raises(ValueError):
            indexer.tokens_to_indices([tokens[-1]], vocab)
Example #2
    def test_tokens_to_indices_with_non_default_feature_name(self):
        tokenizer = SpacyTokenizer(parse=True)
        tokens = tokenizer.tokenize("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
        none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")
        indexer = SingleIdTokenIndexer(
            namespace="dep_labels", feature_name="dep_", default_value="NONE"
        )
        assert indexer.tokens_to_indices([tokens[1]], vocab) == {
            "tokens": [root_index]
        }
        assert indexer.tokens_to_indices([tokens[-1]], vocab) == {
            "tokens": [none_index]
        }
Example #3
class FasttextTokenIndexer(TokenIndexer[int]):
    def __init__(self,
                 model_path,
                 namespace: str = 'tokens',
                 lowercase_tokens: bool = False,
                 model_params_path=None):
        self.model_path = model_path
        self.model_params_path = model_params_path or self.get_params_path(
            model_path)
        self.hash_params = {}
        self.vocab = {}
        self.num_vectors = 0

        self.single_id_indexer = SingleIdTokenIndexer(
            namespace,
            lowercase_tokens)  # ToDo: Add start and end tokens params

        if os.path.exists(self.model_params_path):
            # Assume weights will be loaded later
            self.load_saved_params(self.model_params_path)
        else:
            self.load_ft_model(model_path)

    @classmethod
    def get_params_path(cls, model_path):
        return model_path + '.params'

    def load_saved_params(self, model_param_path):
        with open(model_param_path, encoding="utf-8") as fd:
            ft_params = json.load(fd)
            self.hash_params = ft_params['hash_params']
            self.vocab = ft_params['vocab']

    def load_ft_model(self, model_path):
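        # Pull the subword hashing settings and the word -> row-index vocabulary
        # out of the fastText model, then cache them as JSON so later runs can
        # skip loading the full binary model.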
        self.model_params_path = self.get_params_path(model_path)
        ft = load_fasttext_model(model_path)

        self.hash_params = {
            "minn": ft.min_n,
            "maxn": ft.max_n,
            "num_buckets": ft.bucket,
            "fb_compatible": ft.compatible_hash,
        }

        self.vocab = {word: entry.index for word, entry in ft.vocab.items()}

        with open(self.model_params_path, 'w', encoding="utf-8") as out:
            json.dump(
                {
                    'dimensions': ft.vector_size,
                    'hash_params': self.hash_params,
                    'vocab': self.vocab,
                },
                out,
                ensure_ascii=False,
                indent=2)

    def words_to_indexes(self, words):
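        # Flatten every word into its ngram ids, recording how many ids each
        # word produced (word_lengths) and a parallel all-ones mask so the
        # per-word grouping can be recovered after padding.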
        words_ngram_ids = []
        word_lengths = []
        mask = []
        for word in words:
            ngram_ids = self.get_ngram_ids(word)
            words_ngram_ids += ngram_ids
            mask += [1] * len(ngram_ids)
            word_lengths.append(len(ngram_ids))

        return words_ngram_ids, word_lengths, mask

    def get_ngram_ids(self, word):
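        # In-vocabulary words map to a single id; out-of-vocabulary words fall
        # back to hashed character ngrams, offset by len(self.vocab) so bucket
        # ids never collide with word ids.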
        if word in self.vocab:
            return [self.vocab[word]]
        res = []
        for ngram_id in ft_ngram_hashes(word, **self.hash_params):
            res.append(ngram_id + len(self.vocab))

        return res

    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self.single_id_indexer.count_vocab_items(token, counter)

    def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[TokenType]]:
        words = [token.text for token in tokens]
        word_ngram_ids, word_lengths, mask = self.words_to_indexes(words)

        return {
            f"{index_name}-ngram": word_ngram_ids,
            f"{index_name}-ngram-lengths": word_lengths,
            f"{index_name}-ngram-mask": mask,
            **self.single_id_indexer.tokens_to_indices(tokens, vocabulary, index_name)
        }

    def get_padding_token(self) -> TokenType:
        return 0

    def get_padding_lengths(self, token: TokenType) -> Dict[str, int]:
        return {}

    def as_padded_tensor(
        self,
        tokens: Dict[str, List[TokenType]],
        desired_num_tokens: Dict[str, int],
        padding_lengths: Dict[str, int],
    ) -> Dict[str, torch.Tensor]:

        padded = {
            key: pad_sequence_to_length(val, desired_num_tokens[key])
            for key, val in tokens.items()
        }
        return {key: torch.LongTensor(array) for key, array in padded.items()}
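
A minimal usage sketch for the indexer above (not from the original source): the model path is hypothetical, and the load_fasttext_model / ft_ngram_hashes helpers used by the class are assumed to be importable.

# Usage sketch with a hypothetical fastText binary path.
indexer = FasttextTokenIndexer(model_path="models/fasttext.bin")
vocab = Vocabulary()
tokens = [Token("hello"), Token("xyzqw")]  # unseen words fall back to hashed ngrams
indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
# indices["tokens-ngram"]         -> flattened ngram ids for the whole sequence
# indices["tokens-ngram-lengths"] -> number of ngram ids contributed by each word
# indices["tokens-ngram-mask"]    -> one entry per ngram id, used later for padding
# indices["tokens"]               -> the usual single-id word indices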
Example #4
    def test_no_namespace_means_no_indexing(self):
        indexer = SingleIdTokenIndexer(namespace=None, feature_name="text_id")
        assert indexer.tokens_to_indices([Token(text_id=23)], None) == {
            "tokens": [23]
        }
Example #5
class UnbiasedTopicalExtractorGloVe(nn.Module):
    def __init__(self, device, vocab_dir):
        super(UnbiasedTopicalExtractorGloVe, self).__init__()
        self.device = device
        self.vocab = Vocabulary.from_files(vocab_dir)
        self.embedding = Embedding(
            embedding_dim=300,
            trainable=True,
            num_embeddings=self.vocab.get_vocab_size("tokens"),
            pretrained_file="https://allennlp.s3.amazonaws.com/datasets/glove/glove.840B.300d.txt.gz",
        )
        self.tokenizer = WordTokenizer()
        self.token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.rnn_input_dropout = InputVariationalDropout(0.5)
        self.encoder = _Seq2SeqWrapper(torch.nn.LSTM)(input_size=300,
                                                      hidden_size=300,
                                                      num_layers=1,
                                                      bidirectional=True)

        self._feature_feedforward_layer = torch.nn.Linear(600, 300)
        self._feature_feedforward_dropout = torch.nn.Dropout(0.5)
        self._feature_feedforward_activation = torch.nn.ReLU()
        self._class_classification_layer = torch.nn.Linear(300, 2)
        self._group_classification_layer = torch.nn.Linear(300, 2)
        init.xavier_uniform_(self._feature_feedforward_layer.weight)
        init.zeros_(self._feature_feedforward_layer.bias)
        init.xavier_uniform_(self._class_classification_layer.weight)
        init.zeros_(self._class_classification_layer.bias)
        init.xavier_uniform_(self._group_classification_layer.weight)
        init.zeros_(self._group_classification_layer.bias)
        self._class_loss = torch.nn.CrossEntropyLoss()
        self._group_loss = torch.nn.CrossEntropyLoss()

    def forward(self, alpha, tweets, class_labels=None, group_labels=None):
        batch_indices = []
        batch_masks = []
        max_len = 0
        for tweet in tweets:
            tokens = self.tokenizer.tokenize(tweet)
            indices = self.token_indexer.tokens_to_indices(
                tokens, self.vocab, index_name="tokens")["tokens"]
            max_len = max(len(indices), max_len)
            # The mask has 1 for real tokens and 0 for padding tokens.
            input_mask = [1] * len(indices)
            batch_indices.append(indices)
            batch_masks.append(input_mask)
        for indices, input_mask in zip(batch_indices, batch_masks):
            # Zero-pad up to the max sequence length within the batch.
            padding = [0] * (max_len - len(indices))
            indices += padding
            input_mask += padding
            assert len(indices) == max_len
            assert len(input_mask) == max_len

        batch_indices = torch.tensor(batch_indices).to(self.device)
        mask = torch.tensor(batch_masks).float().to(self.device)
        embedded_tweet = self.embedding(batch_indices)
        embedded_tweet = self.rnn_input_dropout(embedded_tweet)
        # encode tweet, (batch_size, tweet_length, hidden_dim)
        encoded_tweet = self.encoder(embedded_tweet, mask)
        # The pooling layer -- max pooling.
        # (batch_size, model_dim)
        encode_max, _ = replace_masked_values(encoded_tweet,
                                              mask.unsqueeze(-1),
                                              -1e7).max(dim=1)
        feature = self._feature_feedforward_dropout(
            self._feature_feedforward_activation(
                self._feature_feedforward_layer(encode_max)))

        reverse_feature = ReverseLayerF.apply(feature, alpha)
        class_logits = self._class_classification_layer(feature)
        group_logits = self._group_classification_layer(reverse_feature)
        class_probs = torch.nn.functional.softmax(class_logits, dim=-1)
        group_probs = torch.nn.functional.softmax(group_logits, dim=-1)
        output_dict = {
            "class_logits": class_logits,
            "group_logits": group_logits,
            "class_probs": class_probs,
            "group_probs": group_probs
        }
        if class_labels is not None:
            class_loss = self._class_loss(class_logits,
                                          class_labels.long().view(-1))
            output_dict["class_loss"] = class_loss
        if group_labels is not None:
            group_loss = self._group_loss(group_logits,
                                          group_labels.long().view(-1))
            output_dict["group_loss"] = group_loss

        return output_dict
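
A hedged usage sketch for the module above (not from the original source): "vocab/" is a hypothetical directory holding a serialized AllenNLP Vocabulary whose "tokens" namespace matches the GloVe file configured in __init__.

# Usage sketch with hypothetical inputs; the label arguments are optional.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UnbiasedTopicalExtractorGloVe(device=device, vocab_dir="vocab/").to(device)
output = model(
    alpha=0.1,  # strength of the gradient reversal feeding the group head
    tweets=["an example tweet", "another example tweet"],
    class_labels=torch.tensor([0, 1], device=device),
    group_labels=torch.tensor([1, 0], device=device),
)
# output["class_probs"] and output["group_probs"] are (batch, 2) softmax scores;
# "class_loss" / "group_loss" are included only when the matching labels are given.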