def calculate_hosted_collections_search_params(req, pattern_list, field, hosted_collections, verbosity_level=0):
    """Compute the search parameters for the selected hosted collections.

    The patterns in ``pattern_list`` are bound into a single pattern, which
    is then split into basic search units for ``field``.  The hosted search
    engines are selected from ``hosted_collections``.  Intermediate values
    are reported through the verbose printer at level 3.

    Returns the tuple ``(hosted_search_engines, basic_search_units)``.
    """
    from invenio.legacy.search_engine import create_basic_search_units

    assert req
    log = get_verbose_print(
        req,
        'Hosted collections (calculate_hosted_collections_search_params): ',
        verbosity_level)

    bound_pattern = bind_patterns(pattern_list)
    log(3, 'pattern = %s' % cgi.escape(bound_pattern))

    # An empty pattern is deliberately NOT a reason to bail out: the search
    # goes on even with an empty pattern_list and field.
    units = create_basic_search_units(None, bound_pattern, field)
    log(3, 'basic_search_units = %s' % cgi.escape(repr(units)))

    # The engines are returned as selected; no sorted copy is produced here.
    engines = select_hosted_search_engines(hosted_collections)
    log(3, 'hosted_search_engines = ' + str(engines))

    return (engines, units)
def print_external_results_overview(req, current_collection, pattern_list, field, external_collection, verbosity_level=0, lang=CFG_SITE_LANG, print_overview=True):
    """Print the external collection overview box.

    Binds ``pattern_list`` into one pattern, parses it into basic search
    units and selects the external engines for ``current_collection``.
    When ``print_overview`` is true, the overview box for the engines
    (sorted by name) is written to ``req``.

    Returns ``(search_engines, seealso_engines, pattern, basic_search_units)``,
    or ``(None, None, None, None)`` when the bound pattern is empty.
    """
    from invenio.legacy.search_engine import create_basic_search_units

    assert req
    log = get_verbose_print(
        req,
        'External collection (print_external_results_overview): ',
        verbosity_level)

    bound_pattern = bind_patterns(pattern_list)
    log(3, 'pattern = %s' % cgi.escape(bound_pattern))
    if not bound_pattern:
        # Nothing to search for: signal that with an all-None result.
        return (None, None, None, None)

    units = create_basic_search_units(None, bound_pattern, field)
    log(3, 'basic_search_units = %s' % cgi.escape(repr(units)))

    selected = select_external_engines(current_collection, external_collection)
    (search_engines, seealso_engines) = selected
    log(3, 'search_engines = ' + str(search_engines))
    log(3, 'seealso_engines = ' + str(seealso_engines))

    engines_by_name = external_collection_sort_engine_by_name(search_engines)
    log(3, 'search_engines_list (sorted) : ' + str(engines_by_name))

    if print_overview:
        req.write(template.external_collection_overview(lang, engines_by_name))

    return (search_engines, seealso_engines, bound_pattern, units)
def calculate_hosted_collections_search_params(req, pattern_list, field, hosted_collections, verbosity_level=0):
    """Calculate what is needed to search the selected hosted collections.

    Produces the hosted search engines chosen from ``hosted_collections``
    and the basic search units parsed from the bound ``pattern_list`` for
    ``field``.  Each intermediate value is sent to the verbose printer at
    level 3.

    Returns ``(hosted_search_engines, basic_search_units)``.
    """
    from invenio.legacy.search_engine import create_basic_search_units

    assert req
    verbose_print = get_verbose_print(
        req,
        'Hosted collections (calculate_hosted_collections_search_params): ',
        verbosity_level)

    joined_pattern = bind_patterns(pattern_list)
    verbose_print(3, 'pattern = %s' % cgi.escape(joined_pattern))

    # Note: there is intentionally no early return for an empty pattern;
    # the search is allowed to go on with an empty pattern_list and field.
    search_units = create_basic_search_units(None, joined_pattern, field)
    verbose_print(
        3, 'basic_search_units = %s' % cgi.escape(repr(search_units)))

    # Engines are reported and returned unsorted.
    search_engines = select_hosted_search_engines(hosted_collections)
    verbose_print(3, 'hosted_search_engines = ' + str(search_engines))

    return (search_engines, search_units)
def get_answers(req, user_info, of, cc, colls_to_search, p, f, ln):
    """Yield answers from all registered search services, best first.

    Every service in ``registry.services`` is asked for an answer, and the
    ``(relevance, html)`` answers are sorted in descending order.  Answers
    are yielded until one of the display limits is hit:

    * the best relevance does not exceed the minimum relevance to display;
    * more answers than the maximum number of services have been seen;
    * the current answer is too far below the best one;
    * the answer just yielded has the maximum possible relevance.
    """
    search_units = []
    if p:
        from invenio.legacy.search_engine import create_basic_search_units
        search_units = create_basic_search_units(req, p, f)

    ranked_answers = sorted(
        (service.answer(req, user_info, of, cc, colls_to_search,
                        p, f, search_units, ln)
         for service in registry.services),
        reverse=True)

    best_relevance = None
    for position, (answer_relevance, answer_html) in enumerate(ranked_answers, 1):
        if best_relevance is None:
            # First (hence highest) relevance of the sorted answers.
            best_relevance = answer_relevance

        not_relevant_enough = \
            best_relevance <= CFG_WEBSEARCH_SERVICE_MIN_RELEVANCE_TO_DISPLAY
        if not_relevant_enough:
            break

        if position > CFG_WEBSEARCH_SERVICE_MAX_NB_SERVICE_DISPLAY:
            # Enough services have already been displayed.
            break

        relevance_gap = best_relevance - answer_relevance
        if relevance_gap > CFG_WEBSEARCH_SERVICE_MAX_RELEVANCE_DIFFERENCE:
            # This answer is way less good than the previous ones.
            break

        yield answer_relevance, answer_html

        if answer_relevance == CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE:
            # The service considers its answer to be the definitive one.
            break
def calculate_external_search_params(pattern_list, field, hosted_colls):
    """Compute the basic search units for the given search patterns.

    The patterns are bound into one pattern and parsed for ``field``.  The
    hosted collection engines selected from ``hosted_colls`` are returned
    alongside, as ``(external_search_engines, basic_search_units)``.
    """
    from invenio.legacy.search_engine import create_basic_search_units
    from invenio.legacy.websearch_external_collections import (
        bind_patterns,
        select_hosted_search_engines,
    )

    bound_pattern = bind_patterns(pattern_list)
    units = create_basic_search_units(None, bound_pattern, field)
    engines = select_hosted_search_engines(hosted_colls)
    return (engines, units)
def get_answers(req, user_info, of, cc, colls_to_search, p, f, ln):
    """Generate the displayable answers of the registered search services.

    Each service of ``registry.services`` contributes one
    ``(relevance, html)`` answer.  Answers are visited from the highest to
    the lowest and yielded as long as the configured relevance and count
    thresholds allow it.
    """
    if p:
        from invenio.legacy.search_engine import create_basic_search_units
        units = create_basic_search_units(req, p, f)
    else:
        units = []

    collected = [
        service.answer(req, user_info, of, cc, colls_to_search,
                       p, f, units, ln)
        for service in registry.services
    ]
    collected.sort(reverse=True)

    seen = 0
    top_relevance = None
    for relevance, html in collected:
        seen += 1
        if top_relevance is None:
            top_relevance = relevance

        # Checked in this order: best answer not relevant enough, too many
        # services already shown, answer way less good than the best one.
        if (top_relevance <= CFG_WEBSEARCH_SERVICE_MIN_RELEVANCE_TO_DISPLAY
                or seen > CFG_WEBSEARCH_SERVICE_MAX_NB_SERVICE_DISPLAY
                or top_relevance - relevance >
                CFG_WEBSEARCH_SERVICE_MAX_RELEVANCE_DIFFERENCE):
            break

        yield relevance, html

        if relevance == CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE:
            # A definitive answer was given; nothing else is shown.
            break
def print_external_results_overview(req, current_collection, pattern_list, field, external_collection, verbosity_level=0, lang=CFG_SITE_LANG, print_overview=True):
    """Write the external collection overview box and return the query data.

    The result is ``(search_engines, seealso_engines, pattern,
    basic_search_units)``.  If binding ``pattern_list`` gives an empty
    pattern, nothing is written and ``(None, None, None, None)`` is
    returned.  The overview box is only written when ``print_overview`` is
    true.
    """
    from invenio.legacy.search_engine import create_basic_search_units

    assert req
    verbose_print = get_verbose_print(
        req,
        'External collection (print_external_results_overview): ',
        verbosity_level)

    joined_pattern = bind_patterns(pattern_list)
    verbose_print(3, 'pattern = %s' % cgi.escape(joined_pattern))
    if not joined_pattern:
        return (None, None, None, None)

    search_units = create_basic_search_units(None, joined_pattern, field)
    verbose_print(
        3, 'basic_search_units = %s' % cgi.escape(repr(search_units)))

    search_engines, seealso_engines = select_external_engines(
        current_collection, external_collection)
    verbose_print(3, 'search_engines = ' + str(search_engines))
    verbose_print(3, 'seealso_engines = ' + str(seealso_engines))

    # The overview box lists the engines sorted by name.
    sorted_engines = external_collection_sort_engine_by_name(search_engines)
    verbose_print(3, 'search_engines_list (sorted) : ' + str(sorted_engines))

    if print_overview:
        overview_html = template.external_collection_overview(
            lang, sorted_engines)
        req.write(overview_html)

    return (search_engines, seealso_engines, joined_pattern, search_units)
def word_similarity_solr(pattern, hitset, params, verbose, explicit_field, ranked_result_amount):
    """
    Rank the records of a hitset with Solr and return a sorted list.

    input:
        pattern - sequence of query parts; joined with spaces before parsing
        hitset - a list of hits for the query found by search_engine
        params - ranking configuration; the keys read here are "fields"
                 (per-field "weight"), "default_field",
                 "find_similar_to_recid" ("hitset_cutoff"), "prefix" and
                 "postfix"
        verbose - verbose value
        explicit_field - field to search (selected in GUI)
        ranked_result_amount - amount of results to be ranked
    output:
        recset - a list of sorted records: [[23,34], [344,24], [1,01]],
                 or None when the records could not be ranked
        prefix - what to show before the rank value
        postfix - what to show after the rank value
        voutput - contains extra information, content dependent on verbose value
    """
    not_rankable_msg = "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible."
    error_msg = "Records not ranked. An error occurred. Please check the query."
    voutput = ""

    if not len(hitset):
        return ([], "", "", voutput)

    if not pattern:
        return (None, not_rankable_msg, "", voutput)

    pattern = " ".join(map(str, pattern))
    from invenio.legacy.search_engine import create_basic_search_units
    search_units = create_basic_search_units(None, pattern, explicit_field)

    if verbose > 0:
        voutput += "Hitset: %s<br/>" % hitset
        voutput += "Pattern: %s<br/>" % pattern
        voutput += "Search units: %s<br/>" % search_units

    if not search_units:
        # Nothing could be parsed out of the pattern; the code below indexes
        # search_units[0], so report "not ranked" instead of crashing.
        return (None, not_rankable_msg, "", voutput)

    similar_to_recid = search_units[0][2] == 'recid'

    if similar_to_recid:
        # Ranks similar records
        recid = search_units[0][1]
        if verbose > 0:
            voutput += "Ranked amount: %s<br/>" % ranked_result_amount
        try:
            (ranked_result, matched_recs) = solr_get_similar_ranked(recid, hitset, params, ranked_result_amount)
        except Exception:
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not swallowed.
            register_exception()
            return (None, error_msg, "", voutput)

        # Cutoffs potentially large hitset
        cutoff = params['find_similar_to_recid']['hitset_cutoff']
        hitset = intbitset(list(itertools.islice(hitset, cutoff)))
    else:
        # Regular word similarity ranking
        query = ""
        for (operator, unit_pattern, field, unit_type) in search_units:
            if field == '':
                # Any field
                field = 'global'
            elif field not in params["fields"]:
                # Field might not exist
                field = params["default_field"]

            if unit_type == "a":
                # Eliminates leading and trailing %
                # (startswith is safe on an empty pattern, [0] is not)
                if unit_pattern.startswith("%"):
                    unit_pattern = unit_pattern[1:-1]
                unit_pattern = "\"" + unit_pattern + "\""

            weighting = "^" + str(params["fields"][field]["weight"])

            if ':' in unit_pattern:
                unit_pattern = unit_pattern.rsplit(':', 1)[1]

            query_part = field + ":" + unit_pattern + weighting

            # Considers boolean operator from the second part on, allows
            # negation from the first part on
            if query or operator == "-":
                query += " " + BOOLEAN_EQUIVALENTS[operator] + " "

            query += query_part + " "

        if verbose > 0:
            voutput += "Solr query: %s<br/>" % query

        try:
            (ranked_result, matched_recs) = solr_get_ranked(query, hitset, params, ranked_result_amount)
        except Exception:
            register_exception()
            return (None, error_msg, "", voutput)

    if verbose > 0:
        voutput += "All matched records: %s<br/>" % matched_recs

    # Considers not ranked records: they are put in front with a score of 0
    not_ranked = hitset.difference(matched_recs)
    if not_ranked:
        ranked_result = [(recid, 0) for recid in not_ranked] + ranked_result

    if verbose > 0:
        voutput += "Not ranked: %s<br/>" % not_ranked

    # Similar-to-recid requires reverse order
    if similar_to_recid:
        ranked_result.reverse()

    return (ranked_result, params["prefix"], params["postfix"], voutput)
def word_similarity_solr(pattern, hitset, params, verbose, explicit_field, ranked_result_amount):
    """
    Rank the records of a hitset with Solr and return a sorted list.

    input:
        pattern - sequence of query parts; joined with spaces before parsing
        hitset - a list of hits for the query found by search_engine
        params - ranking configuration; the keys read here are "fields"
                 (per-field "weight"), "default_field",
                 "find_similar_to_recid" ("hitset_cutoff"), "prefix" and
                 "postfix"
        verbose - verbose value
        explicit_field - field to search (selected in GUI)
        ranked_result_amount - amount of results to be ranked
    output:
        recset - a list of sorted records: [[23,34], [344,24], [1,01]],
                 or None when the records could not be ranked
        prefix - what to show before the rank value
        postfix - what to show after the rank value
        voutput - contains extra information, content dependent on verbose value
    """
    not_rankable_msg = "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible."
    error_msg = "Records not ranked. An error occurred. Please check the query."
    voutput = ""

    if not len(hitset):
        return ([], "", "", voutput)

    if not pattern:
        return (None, not_rankable_msg, "", voutput)

    pattern = " ".join(map(str, pattern))
    from invenio.legacy.search_engine import create_basic_search_units
    search_units = create_basic_search_units(None, pattern, explicit_field)

    if verbose > 0:
        voutput += "Hitset: %s<br/>" % hitset
        voutput += "Pattern: %s<br/>" % pattern
        voutput += "Search units: %s<br/>" % search_units

    if not search_units:
        # Nothing could be parsed out of the pattern; the code below indexes
        # search_units[0], so report "not ranked" instead of crashing.
        return (None, not_rankable_msg, "", voutput)

    similar_to_recid = search_units[0][2] == 'recid'

    if similar_to_recid:
        # Ranks similar records
        recid = search_units[0][1]
        if verbose > 0:
            voutput += "Ranked amount: %s<br/>" % ranked_result_amount
        try:
            (ranked_result, matched_recs) = solr_get_similar_ranked(recid, hitset, params, ranked_result_amount)
        except Exception:
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not swallowed.
            register_exception()
            return (None, error_msg, "", voutput)

        # Cutoffs potentially large hitset
        cutoff = params['find_similar_to_recid']['hitset_cutoff']
        hitset = intbitset(list(itertools.islice(hitset, cutoff)))
    else:
        # Regular word similarity ranking
        query = ""
        for (operator, unit_pattern, field, unit_type) in search_units:
            if field == '':
                # Any field
                field = 'global'
            elif field not in params["fields"]:
                # Field might not exist
                field = params["default_field"]

            if unit_type == "a":
                # Eliminates leading and trailing %
                # (startswith is safe on an empty pattern, [0] is not)
                if unit_pattern.startswith("%"):
                    unit_pattern = unit_pattern[1:-1]
                unit_pattern = "\"" + unit_pattern + "\""

            weighting = "^" + str(params["fields"][field]["weight"])

            if ':' in unit_pattern:
                unit_pattern = unit_pattern.rsplit(':', 1)[1]

            query_part = field + ":" + unit_pattern + weighting

            # Considers boolean operator from the second part on, allows
            # negation from the first part on
            if query or operator == "-":
                query += " " + BOOLEAN_EQUIVALENTS[operator] + " "

            query += query_part + " "

        if verbose > 0:
            voutput += "Solr query: %s<br/>" % query

        try:
            (ranked_result, matched_recs) = solr_get_ranked(query, hitset, params, ranked_result_amount)
        except Exception:
            register_exception()
            return (None, error_msg, "", voutput)

    if verbose > 0:
        voutput += "All matched records: %s<br/>" % matched_recs

    # Considers not ranked records: they are put in front with a score of 0
    not_ranked = hitset.difference(matched_recs)
    if not_ranked:
        ranked_result = [(recid, 0) for recid in not_ranked] + ranked_result

    if verbose > 0:
        voutput += "Not ranked: %s<br/>" % not_ranked

    # Similar-to-recid requires reverse order
    if similar_to_recid:
        ranked_result.reverse()

    return (ranked_result, params["prefix"], params["postfix"], voutput)
def word_similarity_xapian(pattern, hitset, params, verbose, field, ranked_result_amount):
    """
    Rank the records of a hitset with Xapian and return a sorted list.

    input:
        pattern - sequence of query parts; joined with spaces before parsing
        hitset - a list of hits for the query found by search_engine
        params - ranking configuration; the keys read here are "fields",
                 "default_field", "prefix" and "postfix"
        verbose - verbose value
        field - field to search (selected in GUI)
        ranked_result_amount - amount of results to be ranked
    output:
        recset - a list of sorted records: [[23,34], [344,24], [1,01]]
        prefix - what to show before the rank value
        postfix - what to show after the rank value
        voutput - contains extra information, content dependent on verbose value
    """
    voutput = ""
    search_units = []

    if pattern:
        xapian_init_databases()
        pattern = " ".join(map(str, pattern))
        from invenio.legacy.search_engine import create_basic_search_units
        search_units = create_basic_search_units(None, pattern, field)
        if verbose > 0:
            voutput += "Hitset: %s<br/>" % hitset
            voutput += "Pattern: %s<br/>" % pattern
            voutput += "Search units: %s<br/>" % search_units

    all_ranked_results = []
    included_hits = intbitset()
    excluded_hits = intbitset()
    for (operator, unit_pattern, unit_field, unit_type) in search_units:
        # Field might not exist
        if unit_field not in params["fields"]:
            unit_field = params["default_field"]

        if unit_type == "a":
            # Eliminates leading and trailing %
            # (startswith is safe on an empty pattern, [0] is not)
            if unit_pattern.startswith("%"):
                unit_pattern = unit_pattern[1:-1]
            unit_pattern = "\"" + unit_pattern + "\""

        (ranked_result_part, matched_recs) = xapian_get_ranked_index(unit_field, unit_pattern, params["fields"][unit_field], hitset, ranked_result_amount)

        if verbose > 0:
            voutput += "Index %s: %s<br/>" % (unit_field, ranked_result_part)
            voutput += "Index records %s: %s<br/>" % (unit_field, matched_recs)

        if operator == "-":
            # Excludes - results
            excluded_hits = excluded_hits.union(matched_recs)
        else:
            # + and | are interpreted as OR
            included_hits = included_hits.union(matched_recs)
            all_ranked_results.extend(ranked_result_part)

    if not hitset:
        # Nothing to rank: empty result, no prefix/postfix.
        return ([], "", "", voutput)

    # Removes the excluded records
    result_hits = included_hits.difference(excluded_hits)

    # Avoids duplicate results and normalises scores
    ranked_result = get_greatest_ranked_records(all_ranked_results)
    ranked_result = get_normalized_ranking_scores(ranked_result)

    # Considers not ranked records: they are put in front with a score of 0
    not_ranked = hitset.difference(result_hits)
    if not_ranked:
        ranked_result = [(recid, 0) for recid in not_ranked] + ranked_result

    if verbose > 0:
        voutput += "All matched records: %s<br/>" % result_hits
        voutput += "All ranked records: %s<br/>" % ranked_result
        voutput += "All not ranked records: %s<br/>" % not_ranked

    # Ascending by score. A key function gives the same stable ordering as
    # the former cmp-based comparison and also works where cmp() is gone.
    ranked_result.sort(key=lambda record: record[1])
    return (ranked_result, params["prefix"], params["postfix"], voutput)
def format_template_show_preview_or_save(req, bft, ln=CFG_SITE_LANG, code=None,
                                         ln_for_preview=CFG_SITE_LANG,
                                         pattern_for_preview="",
                                         content_type_for_preview='text/html',
                                         save_action=None,
                                         navtrail=""):
    """
    Print the preview of a record formatted with a format template.

    To be included inside the Format template editor. When both
    save_action and code have a value, the code is saved at the same time.

    @param req: the request object
    @param bft: the filename of the template to save
    @param ln: language
    @param code: the code of a template to use for formatting
    @param ln_for_preview: the language for the preview (for bfo)
    @param pattern_for_preview: the search pattern to be used for the preview (for bfo)
    @param content_type_for_preview: the content-type to use to serve the preview page
    @param save_action: has a value if the code has to be saved
    @param navtrail: navigation trail
    @return: a web page (nothing is returned when the preview is written
             directly to req with a non-HTML content-type)
    """
    ln = wash_language(ln)
    _ = gettext_set_language(ln)

    (auth_code, auth_msg) = check_user(req, 'cfgbibformat')
    if auth_code:
        # Non-zero auth code: the user may not configure BibFormat.
        return page_not_authorized(req=req, text=auth_msg)

    user_info = collect_user_info(req)
    uid = user_info['uid']
    bft = wash_url_argument(bft, 'str')

    must_save = save_action is not None and code is not None
    if must_save:
        bibformatadminlib.update_format_template_code(bft, code=code)
    bibformat_engine.clear_caches()
    if code is None:
        # No code submitted: preview the template as currently stored.
        code = bibformat_engine.get_format_template(bft)['code']

    ln_for_preview = wash_language(ln_for_preview)
    pattern_for_preview = wash_url_argument(pattern_for_preview, 'str')

    # Without a pattern, any non-deleted record is used for the preview.
    no_pattern_given = pattern_for_preview == ""
    if no_pattern_given:
        lookup_query = '-collection:DELETED'
    else:
        lookup_query = pattern_for_preview + ' -collection:DELETED'

    try:
        recID = search_pattern(p=lookup_query).pop()
    except KeyError:
        # pop() found no matching record.
        if no_pattern_given:
            return page(title="No Document Found",
                        body="",
                        uid=uid,
                        language=ln_for_preview,
                        navtrail="",
                        lastupdated=__lastupdated__,
                        req=req,
                        navmenuid='search')
        return page(title="No Record Found for %s" % pattern_for_preview,
                    body="",
                    uid=uid,
                    language=ln_for_preview,
                    navtrail="",
                    lastupdated=__lastupdated__,
                    req=req)

    if no_pattern_given:
        pattern_for_preview = "recid:%s" % recID

    # Keywords handed to the format object: every search unit value except
    # the negated ('-') ones.
    keywords = []
    for unit in create_basic_search_units(None, pattern_for_preview, None):
        if unit[0] != '-':
            keywords.append(unit[1])

    bfo = bibformat_engine.BibFormatObject(recID=recID,
                                           ln=ln_for_preview,
                                           search_pattern=keywords,
                                           xml_record=None,
                                           user_info=user_info)
    body = bibformat_engine.format_with_format_template(
        bft, bfo, verbose=7, format_template_code=code)

    if content_type_for_preview != 'text/html':
        # Output with chosen content-type.
        req.content_type = content_type_for_preview
        req.send_http_header()
        req.write(body)
        return

    # Standard page display with CDS headers, etc.
    return page(title="",
                body=body,
                uid=uid,
                language=ln_for_preview,
                navtrail=navtrail,
                lastupdated=__lastupdated__,
                req=req,
                navmenuid='search')