def find_tokenized_variety(output_token_dict, conditions, threshold, similarity_metric, stemmer, tokenizer_name): """ search for most relevant matches to query based on tokenization :param output_token_dict: The pre-known list of conditions :param conditions: The list of conditions to consider :param threshold: The minimum similarity to retrain :param similarity_metric: Which similarity strategy to use :param stemmer: Which stemmer to use :param tokenizer_name: Which tokenizer to use :type output_token_dict: dict :type conditions: list :type threshold: float :type similarity_metric: str :type stemmer: str :type tokenizer_name: str :return: the top search matches :rtype: list """ total_dict = {} for condition in conditions: condition = condition.strip().lower() if 'whitespace' == tokenizer_name: tokens = tokenizer.whitespace_tokenize(condition, stemmer) elif 'nltk' == tokenizer_name: tokens = tokenizer.nltk_tokenize(condition) else: tokens = [] for item in output_token_dict: item_tokens = output_token_dict[item] # here we compare tokens and item_tokens if 'cosine' == similarity_metric: similarity = metrics.cosine_similarity(tokens, item_tokens) elif 'jaccard' == similarity_metric: similarity = metrics.jaccard_similarity(tokens, item_tokens) else: similarity = metrics.harmonic_similarity(tokens, item_tokens) if similarity > threshold: # add the similarity so that we can rank descending if item in total_dict: if similarity > total_dict[item]: total_dict[item] = similarity else: total_dict[item] = similarity print(condition + ' -> ' + item) print(similarity) sorted_by_similarity = sorted(total_dict.items(), key=operator.itemgetter(1), reverse=True) return sorted_by_similarity
def test_jaccard_similarity_left_empty(): """ test jaccard similarity if the 1st parameter is empty """ assert metrics.jaccard_similarity(LEFT_EMPTY, RIGHT_POPULATED_1) == 0.0
def test_jaccard_similarity_double_empty(): """ test jaccard similarity for 2 empty lists """ assert metrics.jaccard_similarity(LEFT_EMPTY, RIGHT_EMPTY) == 0.0
def test_jaccard_similarity_populated_2(): """ test jaccard similarity for another pair of populated lists """ assert metrics.jaccard_similarity(LEFT_POPULATED_2, RIGHT_POPULATED_2) == 0.2
def test_jaccard_similarity_populated_1(): """ test jaccard similarity for one pair of populated lists """ assert metrics.jaccard_similarity(LEFT_POPULATED_1, RIGHT_POPULATED_1) == 1.0
def test_jaccard_similarity_right_empty(): """ test jaccard similarity if the 2nd parameter is empty """ assert metrics.jaccard_similarity(LEFT_POPULATED_1, RIGHT_EMPTY) == 0.0