def test_sliding_window(self):
    """The first two sliding windows over the test file equal the expected window."""
    corpus = FileCorpus(path_text_file)
    expected = {'current': 'long',
                'context': ['family', 'dashwood', 'settled', 'sussex']}
    # Only the first two windows are checked; stop as soon as we pass them.
    for position, window in enumerate(corpus.get_sliding_window_iterator()):
        if position >= 2:
            break
        assert window == expected
def test_file_corpus(self):
    """The token iterator yields the expected word count and leading words."""
    tokens = FileCorpus(path_text_file).get_token_iterator(verbose=1)
    total_words, words = count_words_and_collect_prefix(tokens)
    print("!!!!!total words", total_words)
    assert total_words == TEST_TEXT_LEN
    assert '|'.join(words) == TEST_FIRST_10_WORDS
def create_from_file(path, min_frequency=0):
    """Collect a vocabulary from the tokens of a single text file.

    Args:
        path: Path to an existing text file.
        min_frequency: Minimum token frequency passed through to
            ``_create_from_iterator``.

    Returns:
        The vocabulary produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` is not an existing file.
    """
    # NOTE(review): a second `create_from_file` (taking a `language`
    # argument) is defined later in this file and shadows this one at
    # import time — confirm which definition is intended to survive.
    if not os.path.isfile(path):
        raise RuntimeError("source file does not exist")
    # Renamed from `iter` to avoid shadowing the builtin.
    token_iter = FileCorpus(path).get_token_iterator()
    return _create_from_iterator(token_iter, min_frequency)
def create_from_file(path, min_frequency=0, language='eng'):
    """Collect a vocabulary from the tokens of a single text file.

    Args:
        path: Path to an existing text file.
        min_frequency: Minimum token frequency passed through to
            ``_create_from_iterator``.
        language: Corpus language code handed to ``FileCorpus``.

    Returns:
        The vocabulary produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` is not an existing file.
    """
    if not os.path.isfile(path):
        raise RuntimeError("source file does not exist")
    # Tokenize without stopword removal so the vocabulary keeps every token.
    tokenizer = Tokenizer(stopwords=[])
    # Renamed from `iter` to avoid shadowing the builtin.
    token_iter = FileCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    return _create_from_iterator(token_iter, min_frequency)
def create_from_path(path, min_frequency=0, language='eng'):
    """Collect a vocabulary from a file or from every file under a directory.

    Args:
        path: Path to a text file or to a directory of text files.
        min_frequency: Minimum token frequency passed through to
            ``_create_from_iterator``.
        language: Corpus language code handed to the corpus reader.

    Returns:
        The vocabulary produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` is neither an existing file nor an
            existing directory.
    """
    # Validate the path first (fail fast) before building any helpers.
    if os.path.isfile(path):
        corpus = FileCorpus(path, language)
    elif os.path.isdir(path):
        corpus = DirCorpus(path, language)
    else:
        raise RuntimeError("source path can not be read")
    # TODO: add option for stopwords
    tokenizer = Tokenizer(stopwords=[])
    # Renamed from `iter` to avoid shadowing the builtin; both corpus
    # kinds now receive the tokenizer the same way (keyword argument).
    token_iter = corpus.get_token_iterator(tokenizer=tokenizer)
    return _create_from_iterator(token_iter, min_frequency)
def test_sentence(self):
    """The first sentence from the corpus is tokenized as expected."""
    corpus = FileCorpus(path_text_file)
    expected = ['family', 'dashwood', 'long', 'settled', 'sussex']
    # Inspect only the first yielded sentence, then stop iterating.
    for sentence in corpus.get_sentence_iterator(verbose=True):
        assert sentence == expected
        break
def test_file_corpus(self):
    """The token iterator yields the expected word count and leading words."""
    corpus = FileCorpus(path_text_file)
    tokens_iter = corpus.get_token_iterator(verbose=1)
    total_words, words = count_words_and_collect_prefix(tokens_iter)
    print("!!!!!total words", total_words)
    # Fix: the original version computed the counts but asserted nothing,
    # so the test could never fail. Assert the same expectations used by
    # the other test_file_corpus definition in this file.
    assert total_words == TEST_TEXT_LEN
    assert '|'.join(words) == TEST_FIRST_10_WORDS