def are_too_similar(product1, product2, product_models_by_product, filter_field, threshold):
    """Tell whether two products overlap too much on a given attribute.

    Each product is looked up in ``product_models_by_product``; the attribute
    named by ``filter_field`` is read from both models via ``get_attribute``
    and the two values are handed to ``text.count_common_terms``.

    :param product1: key of the first product in ``product_models_by_product``.
    :param product2: key of the second product in ``product_models_by_product``.
    :param product_models_by_product: mapping from product to its model; only
        ``.get`` is used on it.
    :param filter_field: name of the model attribute to compare.
    :param threshold: number of common terms that must be strictly exceeded.
    :returns: ``True`` when the common-term count is strictly greater than
        ``threshold``; ``False`` otherwise, including when either product has
        no model (missing key or ``None`` value).
    """
    models = []
    for product in (product1, product2):
        model = product_models_by_product.get(product)
        if model is None:
            # Without a model for both products there is nothing to compare.
            return False
        models.append(model)

    field_values = [model.get_attribute(filter_field) for model in models]
    return text.count_common_terms(*field_values) > threshold
def test_count_common_terms_English():
    """Check ``text.count_common_terms`` on two short English sentences.

    Both sentences are tokenized, stemmed and stripped of stopwords, and the
    resulting stem sets are expected to share exactly three terms.
    """
    language = "english"
    sentences = (
        "Just a test sentence for the purpose of just testing common terms counting.",
        "This is just a sentence for tests purposes.",
    )

    # Run each preprocessing stage over both sentences before moving on to
    # the next stage.
    token_lists = [text.tokenize(sentence) for sentence in sentences]
    stem_lists = [text.get_stems(tokens, language) for tokens in token_lists]
    first_terms, second_terms = [
        set(text.remove_stopwords(stems, language)) for stems in stem_lists
    ]

    # Expected overlap: presumably the stems of "sentence", "purpose" and
    # "test" -- TODO confirm against the stemmer's actual output.
    nose.tools.eq_(text.count_common_terms(first_terms, second_terms), 3)