def test_normalization():
    stopwords = tokenization.load_stopwords("ru")
    input_str = "Я увидел! 32, тестирование 14:32 в по-новому."
    true_result = ["увидеть", "32", "тестирование", "14", "32", "новый"]
    result = tokenization.preprocess_text(input_str, stopwords)
    assert result == true_result
def index_json(
    filename: str,
    index_name: str,
    target_collumn: int,
    buffer_size: int = 10000,
    lang: str = "en",
):
    """Index the contents of a json file.

    Args:
        filename (str): name of the source file.
        index_name (str): name of the output index file.
        target_collumn (int): index of the processed column of the json.
        buffer_size (int, optional): buffer size used while building the index. Defaults to 10000.
        lang (str, optional): language of the stopword list. Defaults to "en".
    """
    stopwords = load_stopwords(lang)
    # Count lines once so tqdm can show overall progress.
    with open(filename, encoding="utf8") as file:
        total = sum(1 for line in file)
    data = iter_json(filename)
    data = (
        (i, preprocess_text(tup[target_collumn], stopwords))
        for i, tup in enumerate(data)
    )
    data = tqdm(data, desc="Building Index", total=total)
    build_index(data, index_name, buffer_size)
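# A minimal usage sketch (the file names, column index, and language below are
# hypothetical, not taken from the project):
#     index_json("reviews.json", "reviews.index", target_collumn=1, lang="ru")
# This preprocesses column 1 of every json record in reviews.json and writes
# the resulting index to reviews.index.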
def test_drop_stopwords():
    stopwords = tokenization.load_stopwords("ru")
    test_text = ["я", "ходить", "в", "университет", "каждый", "день"]
    true_dropped = ["ходить", "университет", "каждый", "день"]
    dropped = tokenization.drop_stopwords(test_text, stopwords)
    assert dropped == true_dropped
def cli_text_search(
    text: str,
    dump_dir: str,
    data_file: str,
    n_results: int = 10,
    suggestion: bool = True,
    lang: str = "en",
):
    """Run a text query against the index and print the top results.

    The query is lowercased, split on non-word characters, stripped of
    stopwords, and converted into a boolean query before searching.
    """
    if suggestion:
        text = sample_text(text)
        print(f"Auto suggestion: {text}")
    stopwords = tokenization.load_stopwords(lang)
    tokens = [
        t for t in re.split(r"\W+", text.lower().strip()) if t not in stopwords
    ]
    if len(tokens) > 1:
        query = Or(*symbols(",".join(tokens)))
    else:
        query = Symbol(tokens[0])
    cli_search(query, dump_dir, data_file, n_results, text)
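# A minimal usage sketch (the query string and paths below are hypothetical):
#     cli_text_search("machine learning", "dumps/", "data.json", n_results=5)
# With more than one remaining token, the tokens are combined into an Or query;
# a single token becomes a plain Symbol before being passed to cli_search.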
def test_load_stopwords():
    stopwords = tokenization.load_stopwords("ru")
    assert isinstance(stopwords, set)
    assert len(stopwords) > 0
    assert "и" in stopwords