def get_solr_results(search_form, page_size, max_pages, start_page=1, valid_ids=None, solr=None, offset=None): if not solr: solr = Solr(settings.SOLR_URL) query_filter = search_form.cleaned_data['filter'] if valid_ids: # Update solr filter to only return results in valid ids ids_filter = 'id:(' + ' OR '.join([str(item) for item in valid_ids]) + ')' if query_filter: query_filter += ' %s' % ids_filter else: query_filter = ids_filter solr_ids = [] solr_count = None try: current_page = start_page n_page_requests = 1 # Iterate over solr result pages while (len(solr_ids) < solr_count or solr_count == None) and n_page_requests <= max_pages: query = search_prepare_query(unquote( search_form.cleaned_data['query'] or ""), unquote(query_filter or ""), search_form.cleaned_data['sort'], current_page, page_size, grouping=False, include_facets=False, offset=offset) result = SolrResponseInterpreter(solr.select(unicode(query))) solr_ids += [element['id'] for element in result.docs] solr_count = result.num_found #print 'Solr page %i (total %i sounds)' % (current_page, solr_count) current_page += 1 n_page_requests += 1 except SolrException, e: raise ServerErrorException(msg='Search server error: %s' % e.message)
def get_solr_results(search_form, page_size, max_pages, start_page=1, valid_ids=None, solr=None, offset=None): if not solr: solr = Solr(settings.SOLR_URL) query_filter = search_form.cleaned_data["filter"] if valid_ids: # Update solr filter to only return results in valid ids ids_filter = "id:(" + " OR ".join([str(item) for item in valid_ids]) + ")" if query_filter: query_filter += " %s" % ids_filter else: query_filter = ids_filter solr_ids = [] solr_count = None try: current_page = start_page n_page_requests = 1 # Iterate over solr result pages while (len(solr_ids) < solr_count or solr_count == None) and n_page_requests <= max_pages: query = search_prepare_query( unquote(search_form.cleaned_data["query"] or ""), unquote(query_filter or ""), search_form.cleaned_data["sort"], current_page, page_size, grouping=False, include_facets=False, offset=offset, ) result = SolrResponseInterpreter(solr.select(unicode(query))) solr_ids += [element["id"] for element in result.docs] solr_count = result.num_found # print 'Solr page %i (total %i sounds)' % (current_page, solr_count) current_page += 1 n_page_requests += 1 except SolrException, e: raise ServerErrorException(msg="Search server error: %s" % e.message)
def filter_both(search_form, target_file=None, extra_parameters=None): """ Filter both strategy will first get either some results from solr and then check if returned results are also valid results in a gaia query, or the other way around. In gaia and solr we can restrict the query to a particular set of results, but there are limitations both in the length of the resulting url and in the number of OR clauses that solr can support. """ if not extra_parameters: extra_parameters = dict() solr_filter_id_block_size = extra_parameters.get( 'cs_solr_filter_id_block_size', 350) solr_filter_id_max_pages = extra_parameters.get( 'cs_solr_filter_id_max_pages', 7) solr_max_pages = extra_parameters.get('cs_max_solr_pages', 7) solr_page_size = extra_parameters.get('cs_solr_page_size', 1000) gaia_filter_id_block_size = extra_parameters.get( 'cs_gaia_filter_id_block_size', 350) gaia_filter_id_max_pages = extra_parameters.get( 'cs_gaia_filter_id_max_pages', 7) gaia_max_pages = extra_parameters.get('cs_max_gaia_pages', 1) gaia_page_size = extra_parameters.get( 'cs_gaia_page_size', 9999999) # We can get ALL gaia results at once if search_form.cleaned_data['target'] or target_file: # First search into gaia and then into solr (get all gaia results) gaia_ids, gaia_count, distance_to_target_data, note = get_gaia_results( search_form, target_file, page_size=gaia_page_size, max_pages=gaia_max_pages) valid_ids_pages = [ gaia_ids[i:i + solr_filter_id_block_size] for i in range(0, len(gaia_ids), solr_filter_id_block_size) if (i / solr_filter_id_block_size) < solr_filter_id_max_pages ] solr_ids = list() solr = Solr(settings.SOLR_URL) for valid_ids_page in valid_ids_pages: page_solr_ids, solr_count = get_solr_results( search_form, page_size=len(valid_ids_page), max_pages=1, valid_ids=valid_ids_page, solr=solr) solr_ids += page_solr_ids if gaia_count <= solr_filter_id_block_size * solr_filter_id_max_pages: # Got complete results, maybe we should log that? #print 'COMPLETE results (starting with gaia)' pass else: # First search into solr and then into gaia # These queries are SLOW because we need to get many pages from solr solr_ids, solr_count = get_solr_results(search_form, page_size=solr_page_size, max_pages=solr_max_pages) # Now we should split solr ids in blocks and iteratively query gaia restricting the results to those ids # present in the current block. However given that gaia results can be retrieved # all at once very quickly, we optimize this bit by retrieving them all at once and avoiding many requests # to similarity server. gaia_ids, gaia_count, distance_to_target_data, note = get_gaia_results( search_form, target_file, page_size=gaia_page_size, max_pages=gaia_max_pages) ''' # That would be the code without the optimization: valid_ids_pages = [solr_ids[i:i+gaia_filter_id_block_size] for i in range(0, len(solr_ids), gaia_filter_id_block_size) if (i/gaia_filter_id_block_size) < gaia_filter_id_max_pages] gaia_ids = list() distance_to_target_data = None note = None for valid_ids_page in valid_ids_pages: page_gaia_ids, page_gaia_count, page_distance_to_target_data, note = get_gaia_results(search_form, target_file, page_size=len(valid_ids_page), max_pages=1, valid_ids=valid_ids_page) gaia_ids += page_gaia_ids ''' if solr_count <= solr_page_size * solr_max_pages and gaia_count < gaia_page_size * gaia_max_pages: # Got complete results, maybe we should log that? #print 'COMPLETE results (starting with solr)' pass if search_form.cleaned_data['target'] or target_file: # Combined search, sort by gaia_ids results_a = gaia_ids results_b = solr_ids else: # Combined search, sort by solr ids results_a = solr_ids results_b = gaia_ids # Combine results results_b_set = set(results_b) combined_ids = [id for id in results_a if id in results_b_set] combined_count = len(combined_ids) return combined_ids[(search_form.cleaned_data['page'] - 1) * search_form.cleaned_data['page_size']:search_form.cleaned_data['page'] * search_form.cleaned_data['page_size']], \ combined_count, distance_to_target_data, None, note, None, None
def merge_optimized(search_form, target_file=None, extra_parameters=None): """ Filter both strategy will first get either some results from solr and then check if returned results are also valid results in a gaia query, or the other way around. In gaia and solr we can restrict the query to a particular set of results, but there are limitations both in the length of the resulting url and in the number of OR clauses that solr can support. """ if not extra_parameters: extra_parameters = dict() solr_filter_id_block_size = extra_parameters.get( 'cs_solr_filter_id_block_size', 350) solr_filter_id_max_pages = extra_parameters.get( 'cs_solr_filter_id_max_pages', 7) solr_max_requests = extra_parameters.get('cs_max_solr_requests', 20) solr_page_size = extra_parameters.get('cs_solr_page_size', 200) gaia_max_pages = extra_parameters.get('cs_max_gaia_pages', 1) gaia_page_size = extra_parameters.get( 'cs_gaia_page_size', 9999999) # We can get ALL gaia results at once num_requested_results = search_form.cleaned_data['page_size'] params_for_next_page = dict() debug_note = '' if search_form.cleaned_data['target'] or target_file: # First search into gaia and get all results that have not been checked in previous calls (indicated in request parameter 'cs_lcvidp') last_checked_valid_id_position = extra_parameters.get('cs_lcvidp', 0) if last_checked_valid_id_position < 0: last_checked_valid_id_position = 0 gaia_ids, gaia_count, distance_to_target_data, note = get_gaia_results( search_form, target_file, page_size=gaia_page_size, max_pages=gaia_max_pages, offset=last_checked_valid_id_position) if len(gaia_ids): # Now divide gaia results in blocks of "solr_filter_id_block_size" results and iteratively query solr limiting the # results to those ids in the common block to obtain common results for the search. # Once we get as many results as "num_requested_results" or we exceed a maximum number # of iterations (solr_filter_id_max_pages), return what we got and update 'cs_lcvidp' parameter for further calls. valid_ids_pages = [ gaia_ids[i:i + solr_filter_id_block_size] for i in range(0, len(gaia_ids), solr_filter_id_block_size) ] solr_ids = list() checked_gaia_ids = list() solr = Solr(settings.SOLR_URL) for count, valid_ids_page in enumerate(valid_ids_pages): page_solr_ids, solr_count = get_solr_results( search_form, page_size=len(valid_ids_page), max_pages=1, valid_ids=valid_ids_page, solr=solr) solr_ids += page_solr_ids checked_gaia_ids += valid_ids_page if len(solr_ids) >= num_requested_results: debug_note = 'Found enough results in %i solr requests' % ( count + 1) #print 'Did %i requests to solr' % (count + 1) break if count + 1 > solr_filter_id_max_pages: debug_note = 'Did %i solr requests (still not enough results)' % ( count + 1) #print 'Too many requests and not enough results' break combined_ids = list() for index, sid in enumerate(checked_gaia_ids): if sid in solr_ids: combined_ids.append(sid) new_last_checked_valid_id_position = index + 1 if len(combined_ids) == num_requested_results: break if len(checked_gaia_ids) == len(gaia_ids): params_for_next_page['no_more_results'] = True params_for_next_page[ 'cs_lcvidp'] = last_checked_valid_id_position + new_last_checked_valid_id_position else: # No more gaia ids to check against solr, no more possible results! combined_ids = list() distance_to_target_data = dict() note = None params_for_next_page['no_more_results'] = True else: # First search into gaia to obtain a list of all sounds that match content-based query parameters gaia_ids, gaia_count, distance_to_target_data, note = get_gaia_results( search_form, target_file, page_size=gaia_page_size, max_pages=gaia_max_pages) last_retrieved_solr_id_pos = extra_parameters.get('cs_lrsidp', 0) if last_retrieved_solr_id_pos < 0: last_retrieved_solr_id_pos = 0 if len(gaia_ids) < solr_filter_id_block_size: # optimization, if there are few gaia_ids, we can get all results in one query solr_ids, solr_count = get_solr_results( search_form, page_size=len(gaia_ids), max_pages=1, valid_ids=gaia_ids, offset=last_retrieved_solr_id_pos) combined_ids = solr_ids[:num_requested_results] params_for_next_page[ 'cs_lrsidp'] = last_retrieved_solr_id_pos + num_requested_results if len(combined_ids) < num_requested_results: params_for_next_page['no_more_results'] = True else: # Now query solr starting at the last retrieved solr result position (parameter 'cs_lrsidp') and iteratively combine the results of # each page of the query with gaia ids. Once we reach the desired "num_requested_results", return what we got and # update 'cs_lrsidp' parameter for further queries. Set a maximum number of iterations (solr_max_requests) to prevent a virtually # infinite query if not enough results are found (num_requested_results is not reached). combined_ids = list() new_last_retrieved_solr_id_pos = last_retrieved_solr_id_pos stop_main_for_loop = False n_requests_made = 0 for i in range(0, solr_max_requests): if stop_main_for_loop: continue offset = last_retrieved_solr_id_pos + i * solr_page_size solr_ids, solr_count = get_solr_results( search_form, page_size=solr_page_size, max_pages=1, offset=offset) n_requests_made += 1 common_ids = list(set(solr_ids).intersection(gaia_ids)) for index, sid in enumerate(solr_ids): new_last_retrieved_solr_id_pos += 1 if sid in common_ids: combined_ids.append(sid) if len(combined_ids) == num_requested_results: stop_main_for_loop = True break if new_last_retrieved_solr_id_pos == solr_count: params_for_next_page['no_more_results'] = True stop_main_for_loop = True break if n_requests_made == solr_max_requests and len( combined_ids) < num_requested_results: debug_note = 'Did %i solr requests (still not enough results)' % n_requests_made #print 'Too many requests and not enough results' else: debug_note = 'Found enough results in %i solr requests' % n_requests_made #print 'Did %i requests to solr' % n_requests_made params_for_next_page['cs_lrsidp'] = new_last_retrieved_solr_id_pos # Combine results return combined_ids, len( combined_ids ), distance_to_target_data, None, note, params_for_next_page, debug_note
raise NotFoundException(msg=e.message) else: raise ServerErrorException(msg='Similarity server error: %s' % e.message) except Exception, e: raise ServerErrorException( msg= 'The similarity server could not be reached or some unexpected error occurred.' ) elif not search_form.cleaned_data[ 'descriptors_filter'] and not search_form.cleaned_data[ 'target'] and not target_file: # Standard text-based search try: solr = Solr(settings.SOLR_URL) query = search_prepare_query( unquote(search_form.cleaned_data['query'] or ""), unquote(search_form.cleaned_data['filter'] or ""), search_form.cleaned_data['sort'], search_form.cleaned_data['page'], search_form.cleaned_data['page_size'], grouping=search_form.cleaned_data['group_by_pack'], include_facets=False) result = SolrResponseInterpreter(solr.select(unicode(query))) solr_ids = [element['id'] for element in result.docs] solr_count = result.num_found more_from_pack_data = None if search_form.cleaned_data['group_by_pack']:
if e.status_code == 500: raise ServerErrorException(msg=e.message) elif e.status_code == 400: raise BadRequestException(msg=e.message) elif e.status_code == 404: raise NotFoundException(msg=e.message) else: raise ServerErrorException(msg='Similarity server error: %s' % e.message) except Exception, e: raise ServerErrorException(msg='The similarity server could not be reached or some unexpected error occurred.') elif not search_form.cleaned_data['descriptors_filter'] and not search_form.cleaned_data['target'] and not target_file: # Standard text-based search try: solr = Solr(settings.SOLR_URL) query = search_prepare_query(unquote(search_form.cleaned_data['query'] or ""), unquote(search_form.cleaned_data['filter'] or ""), search_form.cleaned_data['sort'], search_form.cleaned_data['page'], search_form.cleaned_data['page_size'], grouping=search_form.cleaned_data['group_by_pack'], include_facets=False) result = SolrResponseInterpreter(solr.select(unicode(query))) solr_ids = [element['id'] for element in result.docs] solr_count = result.num_found more_from_pack_data = None if search_form.cleaned_data['group_by_pack']: # If grouping option is on, store grouping info in a dictionary that we can add when serializing sounds