def test_normalization():
    stopwords = tokenization.load_stopwords("ru")
    input_str = "Я увидел! 32, тестирование 14:32 в по-новому."
    true_result = ["увидеть", "32", "тестирование", "14", "32", "новый"]
    result = tokenization.preprocess_text(input_str, stopwords)
    assert result == true_result
def index_json(
    filename: str,
    index_name: str,
    target_collumn: int,
    buffer_size: int = 10000,
    lang: str = "en",
):
    """Index the contents of a json file.

    Args:
        filename (str): name of the source file.
        index_name (str): name of the output index file.
        target_collumn (int): index of the processed column of the json.
        buffer_size (int, optional): buffer size used while building the index. Defaults to 10000.
        lang (str, optional): language of the stopword list. Defaults to "en".
    """
    stopwords = load_stopwords(lang)
    # Count lines once so tqdm can show overall progress.
    with open(filename, encoding="utf8") as file:
        total = sum(1 for line in file)
    data = iter_json(filename)
    data = (
        (i, preprocess_text(tup[target_collumn], stopwords))
        for i, tup in enumerate(data)
    )
    data = tqdm(data, desc="Building Index", total=total)
    build_index(data, index_name, buffer_size)
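# A minimal usage sketch (the file names, column index, and language below are
# hypothetical, not taken from the project):
#     index_json("reviews.json", "reviews.index", target_collumn=1, lang="ru")
# This preprocesses column 1 of every json record in reviews.json and writes
# the resulting index to reviews.index.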
def test_drop_stopwords():
    stopwords = tokenization.load_stopwords("ru")
    test_text = ["я", "ходить", "в", "университет", "каждый", "день"]
    true_dropped = ["ходить", "университет", "каждый", "день"]
    dropped = tokenization.drop_stopwords(test_text, stopwords)
    assert dropped == true_dropped
def cli_text_search(
    text: str,
    dump_dir: str,
    data_file: str,
    n_results: int = 10,
    suggestion: bool = True,
    lang: str = "en",
):
    """Run a text query against the index and print the top results.

    The query is lowercased, split on non-word characters, stripped of
    stopwords, and converted into a boolean query before searching.
    """
    if suggestion:
        text = sample_text(text)
        print(f"Auto suggestion: {text}")
    stopwords = tokenization.load_stopwords(lang)
    tokens = [
        t for t in re.split(r"\W+", text.lower().strip()) if t not in stopwords
    ]
    if len(tokens) > 1:
        query = Or(*symbols(",".join(tokens)))
    else:
        query = Symbol(tokens[0])
    cli_search(query, dump_dir, data_file, n_results, text)
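# A minimal usage sketch (the query string and paths below are hypothetical):
#     cli_text_search("machine learning", "dumps/", "data.json", n_results=5)
# With more than one remaining token, the tokens are combined into an Or query;
# a single token becomes a plain Symbol before being passed to cli_search.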
def test_load_stopwords():
    stopwords = tokenization.load_stopwords("ru")
    assert isinstance(stopwords, set)
    assert len(stopwords) > 0
    assert "и" in stopwords