import jellyfish as jf

from . import config, utils  # project-local modules; import path assumed


def similarity(query, string):
    """Calculate the match between the given `query` and `string`.

    The match is calculated by applying the Jaro-Winkler distance to each
    pair of the matrix (`query` x `string`), taking into consideration the
    position difference of the segments within the two strings.

    Arguments:
        query (str): search query
        string (str): string to test against

    Returns:
        float: normalized value indicating the probability of a match,
            where 0 means completely dissimilar and 1 means equal.
    """
    # split the two strings, cleaning out some stuff
    query = utils.tokenize(query.lower())
    string = utils.tokenize(string.lower())

    # if one of the two strings is falsy (no content, or it was passed with
    # items short enough to be trimmed out), return 0 here to avoid a
    # ZeroDivisionError later on while processing.
    if len(query) == 0 or len(string) == 0:
        return 0

    shortest, longest = sorted((query, string), key=len)

    # matrix of tuples for each segment of both query and string
    matrix = [(s1, s2) for s1 in longest for s2 in shortest]

    matches = {}
    for string1, string2 in matrix:
        # get the Jaro-Winkler equality between the two segments
        match = jf.jaro_winkler(string1, string2)

        # calculate the distance factor for the position of the segments
        # in their respective lists
        positional = utils.position_similarity(
            string1, string2, longest, shortest)

        # put them together and append to the matches dictionary
        matches.setdefault(string1, []).append((match, positional))

    # get the highest value for each list, then apply the word-distance
    # factor; the key uses the Jaro-Winkler value to pick the max
    matches = [max(m, key=lambda x: x[0]) for m in matches.values()]
    _weights = (config.MATCH_WEIGHT, config.DIST_WEIGHT)
    matches = [utils.weighted_average((m, d), _weights) for m, d in matches]

    # get the weighted mean of all the highest matches and apply the highest
    # match value found as a multiplier, to give more weight to more
    # coherent matches.
    mean_match = (sum(matches) / len(matches)) * max(matches)
    return mean_match
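# The `utils` helpers above are referenced but never shown. The following is
# a minimal sketch of what they might look like, inferred from the call
# sites: `tokenize` drops very short tokens (which would explain the
# "trimmed out" note above), `position_similarity` compares the relative
# position of each token within its list, and `weighted_average` is a plain
# weighted mean. All three bodies, and the `min_length` threshold, are
# assumptions, not the project's actual code.
import re


def tokenize(text, min_length=2):
    # split on non-word characters and drop tokens shorter than min_length
    return [t for t in re.split(r'\W+', text) if len(t) >= min_length]


def position_similarity(token1, token2, list1, list2):
    # 1.0 when the two tokens sit at the same relative position in their
    # respective lists, approaching 0.0 as the positions diverge
    pos1 = list1.index(token1) / len(list1)
    pos2 = list2.index(token2) / len(list2)
    return 1.0 - abs(pos1 - pos2)


def weighted_average(values, weights):
    # weighted mean: sum(v * w) / sum(w)
    return sum(v * w for v, w in zip(values, weights)) / sum(weights)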
from django.http import HttpResponseRedirect
from django.shortcuts import render

from search.utils import tokenize  # project-local imports; paths assumed
from wikipage.models import WikiCategory, WikiList, WikiPage


def search_for(request):
    if 'q' not in request.GET:
        return HttpResponseRedirect('/')

    keywords = tokenize(request.GET['q'])
    if not keywords:
        return HttpResponseRedirect('/')

    pages = WikiPage.objects

    # filter by lists, if requested
    selected_lists = []
    if request.POST.get('lists'):
        selected_lists = request.POST['lists'].split(',')
        pages = pages.filter(lists__url_name__in=selected_lists)

    # filter by categories, if requested
    selected_categories = []
    if request.POST.get('categories'):
        selected_categories = request.POST['categories'].split(',')
        pages = pages.filter(categories__url_name__in=selected_categories)

    # filter out pages that do not match all the keywords
    for keyword in keywords:
        pages = pages.filter(pagekeyword__keyword=keyword)

    # use a subquery to calculate the weight of each result; the %s
    # placeholder is filled in by select_params, so the keywords are
    # passed as proper query parameters
    pages = pages.extra(
        select={
            'weight': 'SELECT SUM(count) '
                      'FROM search_pagekeyword '
                      'WHERE search_pagekeyword.page_id = '
                      'wikipage_wikipage.url_name AND '
                      'keyword IN %s'},
        select_params=(tuple(keywords),),
    ).order_by('-weight')

    selected_lists_objects = \
        WikiList.objects.filter(url_name__in=selected_lists)
    selected_cates_objects = \
        WikiCategory.objects.filter(url_name__in=selected_categories)

    context = {
        'pages': pages.all(),
        'lists': WikiList.objects.order_by('title').all(),
        'selected_lists': selected_lists_objects,
        'categories': WikiCategory.objects.order_by('title').all(),
        'selected_cates': selected_cates_objects,
        'keyword': request.GET['q'],
    }

    template = 'search/results.html'
    # note: request.is_ajax() was removed in Django 4.0; on newer versions
    # check the X-Requested-With header directly
    if request.is_ajax():
        template = 'search/results_items.html'
    return render(request, template, context)
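# The view above assumes a particular schema: the raw subquery joins
# search_pagekeyword.page_id to wikipage_wikipage.url_name, and the filters
# traverse `lists`, `categories`, and `pagekeyword` relations. This is a
# hedged sketch of models that would satisfy those queries; field names are
# inferred from the SQL and the filters, and the field sizes are guesses.
from django.db import models


class WikiList(models.Model):
    url_name = models.CharField(max_length=255, primary_key=True)
    title = models.CharField(max_length=255)


class WikiCategory(models.Model):
    url_name = models.CharField(max_length=255, primary_key=True)
    title = models.CharField(max_length=255)


class WikiPage(models.Model):
    # the subquery compares page_id against url_name, which suggests
    # url_name is the primary key; the table name wikipage_wikipage
    # implies the model lives in a "wikipage" app
    url_name = models.CharField(max_length=255, primary_key=True)
    title = models.CharField(max_length=255)
    body = models.TextField()
    lists = models.ManyToManyField(WikiList)
    categories = models.ManyToManyField(WikiCategory)


class PageKeyword(models.Model):
    # the table name search_pagekeyword implies a "search" app; the default
    # reverse name `pagekeyword` matches the filter used in the view
    page = models.ForeignKey(WikiPage, on_delete=models.CASCADE)
    keyword = models.CharField(max_length=100)
    count = models.FloatField()  # term-frequency score from parse_page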
import logging
from collections import Counter

from bs4 import BeautifulSoup

from search.models import PageKeyword  # project-local imports; paths assumed
from search.utils import tokenize
from wikipage.models import WikiPage

logger = logging.getLogger(__name__)


def parse_page(url_name):
    logger.info('Parsing ' + url_name)
    page = WikiPage.objects.get(url_name=url_name)

    # remove noisy info from the page
    soup = BeautifulSoup(page.body, 'html.parser')
    for class_name in ('reflist', 'citation', 'navbox', 'noprint',
                       'reference', 'vertical-navbox'):
        for item in soup.find_all(class_=class_name):
            item.extract()
    site_sub = soup.find(id='siteSub')
    if site_sub is not None:  # not every page carries this element
        site_sub.extract()
    # re-parse so the tree is rebuilt without the extracted elements
    soup = BeautifulSoup(str(soup), 'html.parser')

    # count and save
    tokens = tokenize(soup.get_text())
    counts, total = Counter(tokens), len(tokens)
    title_tokens = tokenize(page.title)
    t_counts, t_total = Counter(title_tokens), len(title_tokens)

    for keyword in counts:
        key = PageKeyword()
        key.keyword = keyword
        # Term Frequency: n / sum + tn / t_sum
        key.count = counts[keyword] / total
        if keyword in t_counts:
            key.count += t_counts[keyword] / t_total
        key.page = page
        key.save()

    # keywords that appear only in the title, not in the body
    for keyword in t_counts:
        if keyword in counts:
            continue
        key = PageKeyword()
        key.keyword = keyword
        key.count = t_counts[keyword] / t_total
        key.page = page
        key.save()
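# To make the term-frequency arithmetic above concrete, here is a small
# self-contained example of the score computed per keyword; the page body
# and title are hypothetical.
from collections import Counter

body_tokens = ['python', 'python', 'search']   # body: "python python search"
title_tokens = ['python', 'search']            # title: "python search"

counts, total = Counter(body_tokens), len(body_tokens)
t_counts, t_total = Counter(title_tokens), len(title_tokens)

for keyword in counts:
    # same formula as in parse_page: body TF plus title TF when present
    score = counts[keyword] / total + t_counts.get(keyword, 0) / t_total
    print(keyword, round(score, 2))
# python 1.17   (2/3 + 1/2)
# search 0.83   (1/3 + 1/2)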