def test_tok2idx_unique_tokens(self):
    """tok2idx with unique tokens

    Feeds two sentences containing no repeated token to ``utils.tok2idx``
    and checks that every token is present in the result and that each
    integer from 0 up to (but excluding) the total token count appears
    among the result's values.
    """
    sentences = [
        ["This", "is", "a", "sentence", "."],
        ["Over", "there", "!"],
    ]
    mapping = utils.tok2idx(sentences)

    # Flatten in sentence order so the checks below visit tokens in the
    # same sequence as a nested sentence/token loop would.
    flat_tokens = [word for sentence in sentences for word in sentence]

    # Every input token must be present in the returned mapping.
    for word in flat_tokens:
        self.assertIn(word, mapping)

    # Each index in 0..N-1 must be assigned to some token, where N is the
    # number of tokens across both sentences.
    assigned = set(mapping.values())
    for position in range(len(flat_tokens)):
        self.assertIn(position, assigned)
def test_idx2tok(self):
    """idx2tok

    Builds a mapping with ``utils.tok2idx`` from two sentences, passes it
    to ``utils.idx2tok``, and checks that the result's keys include each
    integer from 0 up to (but excluding) the total token count and that
    its values include every input token.
    """
    sentences = [
        ["This", "is", "a", "sentence", "."],
        ["Over", "there", "!"],
    ]
    forward = utils.tok2idx(sentences)
    inverse = utils.idx2tok(forward)

    # Flatten in sentence order; its length is the total token count.
    flat_tokens = [word for sentence in sentences for word in sentence]

    # Each index in 0..N-1 must appear as a key of the inverse mapping.
    known_indices = set(inverse.keys())
    for position in range(len(flat_tokens)):
        self.assertIn(position, known_indices)

    # Every input token must appear as a value of the inverse mapping.
    known_tokens = set(inverse.values())
    for word in flat_tokens:
        self.assertIn(word, known_tokens)