def test_tokenize_detokenize_sentencepiece(tmpdir, model_path):
    texts = ["a b c", "a ab c", "a b ac"]

    # Model should be trained
    if model_path is not None:
        model_path = Path(tmpdir) / model_path
    tokens = tokenize(
        TokenizeMethod.SENTENCEPIECE, texts, model_path=model_path, vocab_size=7
    )

    # Control sequence indicating whitespace
    _ = "▁"

    expected_tokens = [
        [_, "a", _, "b", _, "c"],
        [_, "a", _, "a", "b", _, "c"],
        [_, "a", _, "b", _, "a", "c"],
    ]
    assert tokens == expected_tokens

    # Can't detokenize if we didn't give a persistent model path to the tokenize
    # function
    if model_path is not None:
        assert detokenize(TokenizeMethod.SENTENCEPIECE, tokens, model_path) == texts

        # Previously trained model should be reused with the old vocab size, and a
        # new model shouldn't be trained
        tokens = tokenize(TokenizeMethod.SENTENCEPIECE, texts, model_path=model_path)
        assert tokens == expected_tokens
def test_tokenize_detokenize_sentencepiece(tmpdir):
    texts = ["a b c", "a ab c", "a b ac"]

    # Model should be trained
    model_path = Path(tmpdir) / "spm"
    tokens = tokenize(
        TokenizeMethod.SENTENCEPIECE, texts, model_path=model_path, vocab_size=7
    )

    # Control sequence indicating whitespace
    _ = "▁"

    expected_tokens = [
        [_, "a", _, "b", _, "c"],
        [_, "a", _, "a", "b", _, "c"],
        [_, "a", _, "b", _, "a", "c"],
    ]
    assert tokens == expected_tokens

    assert detokenize(TokenizeMethod.SENTENCEPIECE, tokens, model_path) == texts

    # Previously trained model should be reused with the old vocab size, and a new
    # model shouldn't be trained
    tokens = tokenize(TokenizeMethod.SENTENCEPIECE, texts, model_path=model_path)
    assert tokens == expected_tokens
def get_tokens(
    texts: List[str], tokenize_method: TokenizeMethod, vocab_size: int
) -> Optional[List[List[str]]]:
    try:
        return tokenize(tokenize_method, texts, vocab_size=vocab_size)
    except RuntimeError as e:
        str_e = str(e)
        if "vocab_size()" in str_e and "pieces_size()" in str_e:
            st.error(
                "SentencePiece requires your texts to have at least as many different tokens "
                "as its vocabulary size. Try a smaller vocabulary size."
            )
            return None
        else:
            raise
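# A minimal usage sketch for get_tokens above, assuming it runs inside a Streamlit
# app with `st`, `tokenize`, and `TokenizeMethod` already imported as in the snippet.
# The texts and the vocab_size of 1000 are hypothetical; the oversized vocabulary is
# chosen to trigger the RuntimeError branch that get_tokens guards against, in which
# case the error is reported via st.error and None is returned.
sample_texts = ["a b c", "a ab c"]
sample_tokens = get_tokens(sample_texts, TokenizeMethod.SENTENCEPIECE, vocab_size=1000)
if sample_tokens is not None:
    st.write(sample_tokens)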
def make_document_windows(
    X: List[str],
    window_len: int,
    y: Optional[List[T]] = None,
    tokenize_method: TokenizeMethod = TokenizeMethod.SPLIT,
    model_path: Optional[Path] = None,
    vocab_size: Optional[int] = None,
) -> Tuple[List[str], List[int], Optional[List[T]]]:
    """
    This is a helper for when you have a dataset with long documents which is going
    to be passed through a model with a fixed max sequence length.  If you don't
    have enough memory to raise the max sequence length, but you don't want to miss
    out on the information in longer documents, you can use this helper to generate
    a dataset that splits each document into windows roughly the size of your
    ``max_seq_len``.  The resulting dataset can then be used to train your model.
    You should then use :func:`pool_document_windows` to pool the results from
    downstream tasks (ex. predictions, embeddings).

    Note there may still be some mismatch between the window size and the size as
    tokenized by your model, since some models use custom tokenization methods.

    Args:
      X: List of texts to make windows out of.
      window_len: The maximum length of each window.  This should roughly
        correspond to the ``max_seq_len`` of your model.
      y: Optional list of classes (or list of list of labels).  If passed, a
        corresponding list of targets for each window (the target(s) associated
        with the window's document) will be returned.
      tokenize_method: :class:`gobbli.util.TokenizeMethod` corresponding to the
        tokenization method to use for determining windows.
      model_path: This argument is used if the tokenization method requires
        training a model; otherwise, it's ignored.  Path for a tokenization model.
        If it doesn't exist, a new tokenization model will be trained and saved at
        the given path.  If it does exist, the existing model will be used.  If no
        path is given, a temporary directory will be created/used and discarded.
      vocab_size: Number of terms in the vocabulary for tokenization.  May be
        ignored depending on the tokenization method and whether a model is
        already trained.

    Returns:
      A 3-tuple containing a new list of texts split into windows, a corresponding
      list containing the index of each original document for each window, and
      (optionally) a list containing a target per window.  The index should be
      used to pool the output from the windowed text (see
      :func:`pool_document_windows`).
    """
    X_windowed: List[str] = []
    X_windowed_indices: List[int] = []
    y_windowed: List[T] = []

    # Create a temp dir in case it's needed
    with tempfile.TemporaryDirectory() as tmpdir:
        tokenize_kwargs: Dict[str, Any] = {}

        if model_path is None:
            model_path = Path(tmpdir) / "tokenizer"

        tokenize_kwargs["model_path"] = model_path
        detokenize_kwargs = tokenize_kwargs.copy()

        if vocab_size is not None:
            tokenize_kwargs["vocab_size"] = vocab_size

        for i, tokens in enumerate(tokenize(tokenize_method, X, **tokenize_kwargs)):
            for window in detokenize(
                tokenize_method, _chunk_tokens(tokens, window_len), **detokenize_kwargs
            ):
                X_windowed.append(window)
                X_windowed_indices.append(i)
                if y is not None:
                    y_windowed.append(y[i])

    if y is not None:
        return X_windowed, X_windowed_indices, y_windowed
    else:
        return X_windowed, X_windowed_indices, None
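# A minimal usage sketch for make_document_windows above, assuming the default
# whitespace (SPLIT) tokenization. The documents, labels, and window_len here are
# hypothetical; with SPLIT tokenization each window holds at most window_len
# whitespace-delimited tokens, and the returned indices map every window back to
# its source document so downstream outputs can be pooled with
# pool_document_windows.
docs = ["one two three four five", "six seven"]
labels = ["a", "b"]
windows, indices, window_labels = make_document_windows(docs, window_len=3, y=labels)
# windows       -> e.g. ["one two three", "four five", "six seven"]
# indices       -> e.g. [0, 0, 1]
# window_labels -> e.g. ["a", "a", "b"]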
def test_tokenize_spacy(text, tokens):
    # Spacy tokenization lowercases and removes non-alphabetic tokens
    assert tokenize(TokenizeMethod.SPACY, [text]) == [tokens]
def test_tokenize_split(text, tokens):
    # Whitespace tokenization just splits on whitespace
    assert tokenize(TokenizeMethod.SPLIT, [text]) == [tokens]
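# A hedged comparison sketch of the two tokenization methods exercised by the tests
# above. The sample text and expected outputs are illustrative assumptions based on
# the comments (SPACY lowercases and drops non-alphabetic tokens; SPLIT only splits
# on whitespace), not values taken from the test suite's parametrization.
sample = "This is a Test."
split_tokens = tokenize(TokenizeMethod.SPLIT, [sample])  # e.g. [["This", "is", "a", "Test."]]
spacy_tokens = tokenize(TokenizeMethod.SPACY, [sample])  # e.g. [["this", "is", "a", "test"]]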