Пример #1
0
def are_too_similar(product1, product2, product_models_by_product, filter_field, threshold):
    """Tell whether two products share more than `threshold` common terms.

    Each product's model is looked up in `product_models_by_product` via
    ``.get``. If either product has no model, the products are not
    considered similar and ``False`` is returned.

    Otherwise the `filter_field` attribute of each model is read with
    ``get_attribute`` and the two values are compared with
    ``text.count_common_terms``.

    :param product1: key of the first product.
    :param product2: key of the second product.
    :param product_models_by_product: mapping from product key to its model.
    :param filter_field: name of the model attribute to compare.
    :param threshold: number of common terms that must be strictly exceeded.
    :returns: ``True`` if the number of common terms is greater than
        `threshold`, ``False`` otherwise (including when a model is missing).
    """
    models = []
    for product in (product1, product2):
        model = product_models_by_product.get(product)
        if model is None:
            # Without a model there is nothing to compare against.
            return False
        models.append(model)

    first_terms, second_terms = [m.get_attribute(filter_field) for m in models]

    return text.count_common_terms(first_terms, second_terms) > threshold
Пример #2
0
def test_count_common_terms_English():
    """Check common-term counting on two English sentences.

    Both sentences go through the same pipeline: tokenization, stemming,
    stopword removal, and de-duplication into a set. The resulting sets are
    then compared with ``text.count_common_terms``.
    """
    lang = "english"
    sentences = (
        "Just a test sentence for the purpose of just testing common terms counting.",
        "This is just a sentence for tests purposes.",
    )

    # Each stage is applied to both sentences before moving to the next stage.
    token_lists = [text.tokenize(sentence) for sentence in sentences]
    stem_lists = [text.get_stems(tokens, lang) for tokens in token_lists]
    first_terms, second_terms = [
        set(text.remove_stopwords(stems, lang)) for stems in stem_lists
    ]

    # Expected shared terms are the stems of "sentence", "purpose"/"purposes"
    # and "test"/"tests"/"testing".
    expected_common = 3
    nose.tools.eq_(text.count_common_terms(first_terms, second_terms),
                   expected_common)