def get_pdf_snippets(recID, patterns, user_info):
    """ Extract text snippets around 'patterns' from the newest PDF file of 'recID'
        The search is case-insensitive.
        The snippets are meant to look like in the results of the popular search engine:
        using " ... " between snippets.
        For empty patterns it returns ""

        @param recID: record ID to consider
        @param patterns: list of patterns to retrieve
        @param user_info: the user_info dictionary, used to check fulltext authorization
        @return: snippet
    """
    from invenio.bibdocfile import BibRecDocs, check_bibdoc_authorization

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        # Show excluded fulltext in snippets on Inspire, otherwise depending on authorization
        if bd.get_text() and (CFG_INSPIRE_SITE or not check_bibdoc_authorization(user_info, bd.get_status())[0]):
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break # stop at the first good PDF textable file

    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0)
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0)
    if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.has_key(text_path_courtesy):
        nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS[text_path_courtesy]
    if CFG_WEBSEARCH_FULLTEXT_SNIPPETS.has_key(text_path_courtesy):
        max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS[text_path_courtesy]

    if text_path and nb_chars and max_snippets:
        out = ''
        if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'native':
            out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
            if not out:
                # no hit, so check stemmed versions:
                from invenio.bibindex_engine_stemmer import stem
                stemmed_patterns = [stem(p, 'en') for p in patterns]
                out = get_text_snippets(text_path, stemmed_patterns, nb_chars, max_snippets)
        elif CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'SOLR':
            out = solr_get_snippet(patterns, recID, nb_chars, max_snippets)

        if out:
            out_courtesy = ""
            if CFG_INSPIRE_SITE and text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy, out)
        else:
            return ""
    else:
        return ""
def apply_stemming(word, stemming_language):
    """Returns word after applying stemming (if stemming language is set).
       You can change your stemming language in database.

       @param word: word to be checked
       @type word: str
       @param stemming_language: abbreviation of language or None
       @type stemming_language: str
    """
    if stemming_language:
        word = stem(word, stemming_language)
    return word
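# Illustrative usage of apply_stemming (a sketch, not part of the original
# module; the stemmed form shown is taken from the English stemmer tests below):
#
#     apply_stemming("experiments", "en")   # -> "experi"
#     apply_stemming("experiments", None)   # -> "experiments" (unchanged)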
def get_pdf_snippets(recID, patterns,
                     nb_words_around=CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS,
                     max_snippets=CFG_WEBSEARCH_FULLTEXT_SNIPPETS):
    """ Extract text snippets around 'patterns' from the newest PDF file of 'recID'
        The search is case-insensitive.
        The snippets are meant to look like in the results of the popular search engine:
        using " ... " between snippets.
        For empty patterns it returns ""

        @param recID: record ID to consider
        @param patterns: list of patterns to retrieve
        @param nb_words_around: max number of words around the matched pattern
        @param max_snippets: max number of snippets to include
        @return: snippet
    """
    from invenio.bibdocfile import BibRecDocs

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        if bd.get_text():
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break # stop at the first good PDF textable file

    if text_path:
        out = get_text_snippets(text_path, patterns, nb_words_around, max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns, nb_words_around, max_snippets, False)
        if out:
            out_courtesy = ""
            if text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy, out)
        else:
            return ""
    else:
        return ""
def apply_stemming_and_stopwords_and_length_check(word, stemming_language):
    """Return WORD after applying stemming and stopword and length checks.
       See the config file in order to influence these.
    """
    # now check against stopwords:
    if is_stopword(word):
        return ""
    # finally check the word length:
    if len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH:
        return ""
    # stem word, when configured so:
    if stemming_language:
        word = stem(word, stemming_language)
    return word
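# Illustrative behaviour of the combined check above (a sketch, not part of the
# original module; assumes stopword removal is enabled with "the" in the
# configured stopword list, and that CFG_BIBINDEX_MIN_WORD_LENGTH keeps a small
# default value):
#
#     apply_stemming_and_stopwords_and_length_check("the", "en")          # -> ""
#     apply_stemming_and_stopwords_and_length_check("experiments", "en")  # -> "experi"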
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """
    Answer question given by context, using knowledge base returned by
    L{get_kbname}.

    Return (relevance, html_string) where relevance is integer
    from 0 to 100 indicating how relevant to the question the
    answer is (see L{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE}
    for details), and html_string being a formatted answer.
    """
    _ = gettext_set_language(ln)

    # words = [stem(unit[1], ln) for unit in search_units if unit[2] == '']
    words = [stem(unit[1].lower(), CFG_SITE_LANG) for unit in search_units if unit[2] == '']

    cache = self.get_data_cache()

    matching_values = {}
    for word in words:
        res = cache.get(word, [])
        for keyword in res:
            if not matching_values.has_key(keyword):
                matching_values[keyword] = 1
            else:
                matching_values[keyword] += 1

    # order matching values per score
    matching_values_sorted = sorted(matching_values.iteritems(),
                                    key=lambda (k, v): (v, k), reverse=True)
    if not matching_values_sorted:
        return (0, '')

    best_score = matching_values_sorted[0][1]

    # Compute relevance. How many words from query did match
    relevance = min(100, max(0, (100 * float(best_score) /
                                 len([word for word in words if len(word) > 3])) - 10))

    labels_and_links = [m.split("|", 1) for m in matching_values.keys()]
    translated_labels_and_links = [(_(label), url) for label, url in labels_and_links]

    return (relevance, self.display_answer_helper(translated_labels_and_links, ln))
def clean_and_split_words_and_stem(string, ln=CFG_SITE_LANG, stem_p=True):
    """
    Helper function to split and stem words in a string.

    @param ln: language to consider for stemming
    @param stem_p: if True, also stem the word according to C{ln}

    @return: list of (stemmed) words
    """
    alphanum_string = re_non_alphanum_only.sub(" ", string).lower()
    words = re_split_words_pattern.split(alphanum_string)
    if stem_p:
        # lowering must be done after stemming
        words = [stem(word, ln) for word in words]

    return words
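# Illustrative call (a sketch, not part of the original module; the exact output
# depends on re_split_words_pattern and the stemmer backend, and may contain
# empty strings at the boundaries):
#
#     clean_and_split_words_and_stem("High-Energy Physics", "en")
#     # -> roughly ['high', 'energi', 'physic']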
def test_stemmer_greek(self):
    """bibindex engine - Greek stemmer"""
    greek_test_cases = [['πληροφορίες', 'ΠΛΗΡΟΦΟΡΙ'],
                        ['πείραμα', 'ΠΕΙΡΑΜ'],
                        ['πειράματα', 'ΠΕΙΡΑΜ'],
                        ['πειραματιστής', 'ΠΕΙΡΑΜΑΤΙΣΤ'],
                        ['πειραματίζομαι', 'ΠΕΙΡΑΜΑΤΙΖ'],
                        ['πειραματίζεσαι', 'ΠΕΙΡΑΜΑΤΙΖ'],
                        ['πειραματίστηκα', 'ΠΕΙΡΑΜΑΤΙΣΤ'],
                        ['πειραματόζωο', 'ΠΕΙΡΑΜΑΤΟΖΩ'],
                        ['ζώο', 'ΖΩ'],
                        ['πειραματισμός', 'ΠΕΙΡΑΜΑΤΙΣΜ'],
                        ['πειραματικός', 'ΠΕΙΡΑΜΑΤΙΚ'],
                        ['πειραματικά', 'ΠΕΙΡΑΜΑΤ'],
                        ['ηλεκτρόνιο', 'ΗΛΕΚΤΡΟΝΙ'],
                        ['ηλεκτρονιακός', 'ΗΛΕΚΤΡΟΝΙΑΚ'],
                        ['ακτίνα', 'ΑΚΤΙΝ'],
                        ['ακτινοβολία', 'ΑΚΤΙΝΟΒΟΛ'],
                        ['E=mc^2', 'E=MC^2'],
                        ['α+β=γ', 'Α+Β=Γ']]
    for test_word, expected_result in greek_test_cases:
        self.assertEqual(expected_result, bibindex_engine_stemmer.stem(test_word, "el"))
def prepare_data_cache(self):
    """
    "Index" collection names
    """
    from invenio.search_engine import collection_i18nname_cache
    cache = {}
    words_and_coll = [(' '.join([' '.join([stem(word.lower(), ln) for word in
                                           whitespace_re.split(non_alphanum_chars_only_re.sub(' ', translation))])
                                 for ln, translation in translations.iteritems()]),
                       coll_name)
                      for coll_name, translations in collection_i18nname_cache.cache.iteritems()]
    for words, coll in words_and_coll:
        for word in whitespace_re.split(words):
            if not word.strip():
                continue
            if not cache.has_key(word):
                cache[word] = []
            if not coll in cache[word]:
                cache[word].append(coll)
    return cache
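# Shape of the cache built above (a sketch with hypothetical collection names):
# each stemmed, lowercased word of every translated collection name maps to the
# list of collection names containing it, e.g.
#
#     {'preprint': ['Preprints'], 'book': ['Books'], ...}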
def get_pdf_snippets(recID, patterns,
                     nb_words_around=CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS,
                     max_snippets=CFG_WEBSEARCH_FULLTEXT_SNIPPETS):
    """ Extract text snippets around 'patterns' from the newest PDF file of 'recID'
        The search is case-insensitive.
        The snippets are meant to look like in the results of the popular search engine:
        using " ... " between snippets.
        For empty patterns it returns ""
    """
    from invenio.bibdocfile import BibRecDocs

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        if bd.get_text():
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if not text_path_courtesy and CFG_INSPIRE_SITE:
                text_path_courtesy = bd.get_type()
            break # stop at the first good PDF textable file

    if text_path:
        out = get_text_snippets(text_path, patterns, nb_words_around, max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns, nb_words_around, max_snippets, False)
        if out:
            out_courtesy = ""
            if text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy, out)
        else:
            return ""
    else:
        return ""
def test_stemmer_english(self):
    """bibindex engine - English stemmer"""
    english_test_cases = [['information', 'inform'],
                          ['experiment', 'experi'],
                          ['experiments', 'experi'],
                          ['experimented', 'experi'],
                          ['experimenting', 'experi'],
                          ['experimental', 'experiment'],
                          ['experimentally', 'experiment'],
                          ['experimentation', 'experiment'],
                          ['experimentalism', 'experiment'],
                          ['experimenter', 'experiment'],
                          ['experimentalise', 'experimentalis'],
                          ['experimentalist', 'experimentalist'],
                          ['experimentalists', 'experimentalist'],
                          ['GeV', 'GeV'],
                          ['$\Omega$', '$\Omega$'],
                          ['e^-', 'e^-'],
                          ['C#', 'C#'],
                          ['C++', 'C++']]
    for test_word, expected_result in english_test_cases:
        self.assertEqual(expected_result, bibindex_engine_stemmer.stem(test_word, "en"))
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank records containing the specified words and return a sorted list.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value

    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    lwords_old = lwords
    lwords = []

    # Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword(term, 1):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], " ", term)))
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"):  # stem word
                    term = stem(string.replace(term, " ", ""), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})

    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" %
                            methods[rank_method_code]["rnkWORD_table"], (term,))
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])), term_recs,
                                                                  hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None,
                "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
                "", voutput)
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
    rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def test_stemmer_none(self):
    """bibindex engine - no stemmer"""
    self.assertEqual("information",
                     bibindex_engine_stemmer.stem("information", None))
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """
    Answer question given by context.

    Return (relevance, html_string) where relevance is integer
    from 0 to 100 indicating how relevant to the question the
    answer is (see C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE}
    for details), and html_string being a formatted answer.
    """
    from invenio.search_engine import \
         get_permitted_restricted_collections, \
         get_coll_i18nname, \
         collection_i18nname_cache, \
         collection_restricted_p
    _ = gettext_set_language(ln)

    # stem search units. remove those with field
    # TODO: search in hosted collection names too
    # TODO: ignore unattached trees
    # TODO: use synonyms
    if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
       (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
        return (0, '')

    words = [stem(unit[1], ln) for unit in search_units if unit[2] in ('', 'collection')] # Stemming

    if not words:
        return (0, '')

    permitted_restricted_collections = get_permitted_restricted_collections(user_info)

    cache = self.get_data_cache()

    matching_collections = {}
    for word in words:
        if CFG_CERN_SITE and word == 'cern':
            # This keyword is useless here...
            continue

        colls = cache.get(word.lower(), [])
        for coll in colls:
            if collection_restricted_p(coll) and \
               not coll in permitted_restricted_collections:
                # Skip restricted collection user do not have access
                continue
            if not matching_collections.has_key(coll):
                matching_collections[coll] = 0
            matching_collections[coll] += 1

    matching_collections_sorted = sorted(matching_collections.iteritems(),
                                         key=lambda (k, v): (v, k), reverse=True)
    if not matching_collections_sorted:
        return (0, '')

    matching_collections_names = [(get_coll_i18nname(coll, ln, False),
                                   CFG_SITE_URL + '/collection/' + urllib.quote(coll, safe='') + '?ln=en')
                                  for coll, score in matching_collections_sorted]

    best_score = matching_collections_sorted[0][1]
    best_coll_words = whitespace_re.split(matching_collections_sorted[0][0])
    relevance = min(100, max(0, (100 * float(2 * best_score) / float(len(best_coll_words) + len(words)) - 10)))

    if (('submit' in p.lower()) or (_('submit') in p.lower())) and \
       not (('submit' in best_coll_words) or (_('submit') in best_coll_words)):
        # User is probably looking for a submission. Decrease relevance
        relevance = max(0, relevance - 30)

    return (relevance, self.display_answer_helper(matching_collections_names, ln))
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """
    Answer question given by context.

    Return (relevance, html_string) where relevance is integer
    from 0 to 100 indicating how relevant to the question the
    answer is (see C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE}
    for details), and html_string being a formatted answer.
    """
    _ = gettext_set_language(ln)

    if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
       (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
        return (0, '')

    words = [stem(unit[1].lower(), CFG_SITE_LANG) for unit in search_units if unit[2] == '']

    if not words:
        return (0, '')

    cache = self.get_data_cache()

    # TODO: If all categories of a submission match, display only submission (not categories)

    matching_submissions = {}
    for word in words:
        # Look for submission names
        if CFG_CERN_SITE and word == 'cern':
            # This keyword is useless here...
            continue

        submissions = cache.get(word, [])
        for doctype, submission_label, category in submissions:
            if acc_authorize_action(req, 'submit',
                                    authorized_if_no_roles=not isGuestUser(user_info['uid']),
                                    doctype=(CFG_CERN_SITE and doctype.startswith('GENSBM#') and 'GENSBM') or doctype,
                                    categ=category)[0] != 0:
                # Not authorized to submit in this submission
                continue
            if not matching_submissions.has_key((doctype, submission_label)):
                matching_submissions[(doctype, submission_label)] = 0
            add_score = 1
            if category != '*':
                # This is the submission category, consider that
                # words that are part of the submission name are
                # less important than others here:
                if not word.lower() in category.lower():
                    # word is only in submission name
                    add_score = 0.5
                else:
                    add_score = 1.5
            matching_submissions[(doctype, submission_label)] += add_score

    matching_submissions_sorted = sorted(matching_submissions.iteritems(),
                                         key=lambda (k, v): (v, k), reverse=True)
    if not matching_submissions_sorted:
        return (0, '')
    best_score = matching_submissions_sorted[0][1]
    max_score_difference = 1.9

    matching_submissions_names = [(submission_label,
                                   CFG_SITE_URL + '/submit?doctype=' + doctype.split("#", 1)[0] + '&ln=' + ln +
                                   (CFG_CERN_SITE and doctype.startswith('GENSBM#') and '#' + doctype.split("#", 1)[-1] or ''))
                                  for (doctype, submission_label), score in matching_submissions_sorted
                                  if score > best_score - max_score_difference]

    best_sbm_words = whitespace_re.split(matching_submissions_sorted[0][0][1])
    score_bonus = (((_("Submit").lower() in words) or ("submit" in words)) or
                   ((_("Revise").lower() in words) or ("revise" in words)) or
                   ((_("Modify").lower() in words) or ("modify" in words))) and 40 or 0
    relevance = min(100, max(0, (score_bonus + (100 * float(best_score) / float(len(best_sbm_words) + len(words)))) - 10))

    return (relevance, self.display_answer_helper(matching_submissions_names, ln))