Example #1
def test_remove_stopwords(_nltk, text_data, tokenized_test_list_main):
    assert (
        to_list(remove_stopwords(to_list(to_lower(tokenized_test_list_main))))
        == text_data["answer_key_remove_stop_words"]
    ), "Output did not correctly remove NLTK stopwords from documents"
    assert (
        to_list(
            remove_stopwords(
                to_list(to_lower(tokenized_test_list_main)),
                custom_stopwords=text_data["more_words"],
            )
        )
        == text_data["answer_key_remove_stop_words_more"]
    ), "Output did not correctly remove NLTK stopwords and custom added stopwords from documents"
Example #2
def test_remove_punct(_nltk, text_data, tokenized_test_list_main):
    assert (
        to_list(remove_punct(tokenized_test_list_main))
        == text_data["answer_key_remove_punct"]
    ), "Output did not correctly remove punctuation inside of words from documents"
    assert (
        to_list(
            remove_punct(tokenized_test_list_main, remove_all=True, replace_char="|")
        )
        == text_data["answer_key_replace_all_punct_with_pipe"]
    ), "Output did not correctly replace all punctuation with the | character"
    assert (
        to_list(remove_punct(tokenized_test_list_main, remove_all=True))
        == text_data["answer_key_remove_all_punct"]
    ), "Output did not correctly remove all punctuation from documents"
Example #3
def test_remove_single_char_and_spaces(text_data):
    assert (
        to_list(
            remove_single_char_and_spaces(
                text_data["test_list_single_char_and_spaces"]
            )
        )
        == text_data["answer_key_single_char_and_spaces"]
    ), "Output did not correctly remove all instances of single character words and spaces from documents"
Example #4
def test_custom_pipeline(_nltk, text_data):
    def shout(text_docs_bow):
        # Custom pipeline step: upper-case every token in every document
        return [[word.upper() for word in doc] for doc in text_docs_bow]

    assert (
        to_list(
            preprocess_texts(
                text_data["test_list_custom"],
                custom_pipeline=["tokenize", shout],
            )
        )
        == text_data["answer_key_custom"]
    ), "Output did not correctly incorporate custom function into preprocessing pipeline"
Example #5
def test_bag_of_words(_nltk, text_data):
    bag = to_list(
        bag_of_words_to_docs(preprocess_texts(text_data["test_list_main"]))
    )

    assert len(text_data["test_list_main"]) == len(
        bag
    ), "Number of documents in input does not match number of documents in output"
    assert isinstance(bag, list), "Output is not of the expected return type of list"
    assert isinstance(
        bag[0], str
    ), "Output does not contain expected type of string inside of return value"
Example #6
def test_remove_digits(text_data):
    assert (
        to_list(remove_digits(text_data["test_list_digits"]))
        == text_data["answer_key_remove_digits"]
    ), "Output did not correctly remove all digits from documents"
Example #7
def test_to_lower(_nltk, text_data, tokenized_test_list_main):
    assert (
        to_list(to_lower(tokenized_test_list_main)) == text_data["answer_key_lower"]
    ), "Output did not correctly convert documents to lowercase"
Example #8
def test_tokenizer(_nltk, text_data):
    assert (
        to_list(tokenize(text_data["test_list_main"]))
        == text_data["answer_key_tokenized"]
    ), "Output did not correctly tokenize documents"
Example #9
@pytest.fixture  # decorator inferred: the tests above consume this as a pytest fixture
def tokenized_test_list_main(text_data):
    return to_list(tokenize(text_data["test_list_main"]))