def test_Reader_file():
    """Check what a default ``Reader`` yields for the single file ``a.md``.

    The reader is called with the path of ``a.md`` under ``DATA_PATH``; the
    call must produce exactly three items, and the first item's ``text``
    attribute must be ``'line 1 in a.'``.
    """
    target = os.path.join(DATA_PATH, 'a.md')
    read = Reader()
    # Materialise the iterable so it can be counted and indexed.
    collected = list(read(target))
    assert len(collected) == 3
    assert collected[0].text == 'line 1 in a.'
def test_Reader_gen_plines():
    """Check that ``Reader.gen_plines`` is lazy and yields three items.

    ``gen_plines`` is called with the path of ``b.txt`` under ``DATA_PATH``;
    the result must be a generator object that produces exactly three items
    when exhausted.
    """
    lines = Reader.gen_plines(os.path.join(DATA_PATH, 'b.txt'))
    # isinstance() already returns a bool; comparing it to True is redundant.
    assert isinstance(lines, types.GeneratorType)
    assert len(list(lines)) == 3
def test_Reader_gen_flines():
    """Check the ``gen_files`` -> ``gen_articles`` -> ``gen_flines`` pipeline.

    Paths matching ``'*.txt'`` under ``DATA_PATH`` are fed to
    ``Reader.gen_articles``, whose output is fed to ``Reader.gen_flines``.
    The final result must be a generator object that produces exactly nine
    items when exhausted.
    """
    paths = Reader.gen_files(DATA_PATH, '*.txt')
    articles = Reader.gen_articles(paths)
    lines = Reader.gen_flines(articles)
    # isinstance() already returns a bool; comparing it to True is redundant.
    assert isinstance(lines, types.GeneratorType)
    assert len(list(lines)) == 9
def test_Reader_gen_files():
    """Check that ``Reader.gen_files`` lazily yields three ``'*.md'`` matches.

    ``gen_files`` is called with ``DATA_PATH`` and the pattern ``'*.md'``;
    the result must be a generator object that produces exactly three items
    when exhausted.
    """
    paths = Reader.gen_files(DATA_PATH, '*.md')
    # isinstance() already returns a bool; comparing it to True is redundant.
    assert isinstance(paths, types.GeneratorType)
    assert len(list(paths)) == 3
def get_Reader_path_match_res(request):
    """Return everything a ``Reader`` built from ``request.param`` yields.

    A ``Reader`` is constructed with ``request.param`` as its only argument
    and then called on ``DATA_PATH``; all produced items are returned as a
    list.

    NOTE(review): reads ``request.param``, so this is presumably a
    parametrized pytest fixture whose decorator is not visible here -- confirm.
    """
    matcher = Reader(request.param)
    return list(matcher(DATA_PATH))
# Package-level setup for ``pnlp``: imports the public classes and helpers from
# the submodules, creates module-level default instances (``num_norm``, ``reg``,
# ``reader``, ``tlsampler``, ``slsampler``), and declares the package metadata
# (``__title__``, ``__version__``, ``__author__``, ``__license__``,
# ``__copyright__``) and the ``__all__`` export list.
# NOTE(review): in this view the statements are collapsed onto one physical line
# and the ``__all__`` list has no closing bracket -- the tail appears truncated;
# confirm the remaining entries against the original file.
from pnlp.piop import Reader, Dict from pnlp.ptxt import Regex, Text, Length from pnlp.pnorm import NumNorm from pnlp.penh import TokenLevelSampler, SentenceLevelSampler from pnlp.pmag import MagicDict from pnlp.stopwords import StopWords from pnlp.stopwords import chinese_stopwords, english_stopwords from pnlp.utils import pstr, concurring, divide2int from pnlp.utils import generate_batches_by_num, generate_batches_by_size num_norm = NumNorm() reg = Regex() reader = Reader() tlsampler = TokenLevelSampler() slsampler = SentenceLevelSampler() __title__ = 'pnlp' __version__ = '0.4.0' __author__ = 'Yam' __license__ = 'Apache-2.0' __copyright__ = 'Copyright 2019, 2020 Yam' __all__ = ['Reader', 'Text', 'Regex', 'Length', 'MagicDict', 'NumNorm', 'StopWords', 'TokenLevelSampler', 'SentenceLevelSampler'
def test_Reader_gen_files_with_regex():
    """Check ``Reader.gen_files`` with the pattern ``"(md)|(txt)"``.

    ``gen_files`` is called with ``DATA_PATH``, the pattern ``"(md)|(txt)"``
    and a third positional argument of ``True``; the result must be a
    generator object that produces exactly six items when exhausted.

    NOTE(review): the third argument presumably switches the pattern from
    glob to regular-expression matching -- confirm against
    ``Reader.gen_files``.
    """
    paths = Reader.gen_files(DATA_PATH, "(md)|(txt)", True)
    # isinstance() already returns a bool; comparing it to True is redundant.
    assert isinstance(paths, types.GeneratorType)
    assert len(list(paths)) == 6