import collections

import numpy as np
import pytest
import spacy
from spacy.tokens import Doc

from textacy import Corpus
from textacy import load_spacy_lang
from textacy.datasets.capitol_words import CapitolWords

# Dataset handle shared by the module-level skip condition and the fixture.
DATASET = CapitolWords()

# Skip every test in this module when DATASET.filepath is None.
pytestmark = pytest.mark.skipif(
    DATASET.filepath is None,
    reason="CapitolWords dataset must be downloaded before running tests",
)


@pytest.fixture(scope="module")
def corpus():
    """Return an "en" Corpus built from ``DATASET.records(limit=5)``.

    Module-scoped, so the corpus is constructed once per test module.
    """
    records = DATASET.records(limit=5)
    return Corpus("en", data=records)


class TestCorpusInit:
    """Checks on the ``lang`` argument accepted by the ``Corpus`` constructor."""

    def test_corpus_init_lang(self):
        """Accept a language string or ``load_spacy_lang`` result; reject others.

        A bytes value and ``None`` are both expected to raise ``TypeError``.
        """
        from_name = Corpus("en")
        assert isinstance(from_name, Corpus)

        from_loaded = Corpus(load_spacy_lang("en"))
        assert isinstance(from_loaded, Corpus)

        for invalid_lang in (b"en", None):
            with pytest.raises(TypeError):
                Corpus(invalid_lang)
def test_ioerror(self):
    """Iterating ``texts()`` on a dataset rooted at ``self.tempdir`` raises IOError.

    NOTE(review): presumably nothing has been downloaded into ``self.tempdir``
    at this point -- confirm against the test case's setUp.
    """
    capitol_words = CapitolWords(data_dir=self.tempdir)
    with self.assertRaises(IOError):
        # Drain the iterable so a lazily-raised error actually surfaces.
        for _ in capitol_words.texts():
            pass
def test_download(self):
    """After ``download()``, the path in ``dataset.filename`` must exist.

    The dataset is rooted at ``self.tempdir``.
    NOTE(review): ``download()`` presumably needs network access -- confirm.
    """
    # Imported locally: `os` is not among the module-level imports visible
    # in this file, so referencing it would otherwise raise NameError.
    import os

    dataset = CapitolWords(data_dir=self.tempdir)
    dataset.download()
    self.assertTrue(os.path.exists(dataset.filename))
def test_ioerror(tmpdir):
    """Iterating ``texts()`` on a dataset rooted at ``tmpdir`` raises IOError.

    ``tmpdir`` is converted with ``str()`` before being passed as ``data_dir``.
    """
    data_dir = str(tmpdir)
    capitol_words = CapitolWords(data_dir=data_dir)
    with pytest.raises(IOError):
        # Drain the iterable so a lazily-raised error actually surfaces.
        for _ in capitol_words.texts():
            pass
def test_download(tmpdir):
    """After ``download()``, the path in ``dataset.filename`` must exist.

    The dataset is rooted at the ``tmpdir`` argument, converted to ``str``.
    NOTE(review): ``download()`` presumably needs network access -- confirm.
    """
    # Imported locally: `os` is not among the module-level imports visible
    # in this file, so referencing it would otherwise raise NameError.
    import os

    # Fixed: the original passed the undefined name `tempdir` (NameError);
    # the parameter this function receives is `tmpdir`.
    dataset = CapitolWords(data_dir=str(tmpdir))
    dataset.download()
    assert os.path.exists(dataset.filename)
def test_download(tmpdir):
    """After ``download()``, ``dataset._filepath`` must point at a regular file.

    The dataset is rooted at the ``tmpdir`` argument, converted to ``str``.
    NOTE(review): ``download()`` presumably needs network access -- confirm.
    """
    # Imported locally: `os` is not among the module-level imports visible
    # in this file, so referencing it would otherwise raise NameError.
    import os

    dataset = CapitolWords(data_dir=str(tmpdir))
    dataset.download()
    # Reads the private `_filepath` attribute directly, as the original did.
    assert os.path.isfile(dataset._filepath)