Example #1
def get_pdf_snippets(recID, patterns, user_info):
    """
    Extract text snippets around 'patterns' from the newest PDF file of 'recID'
    The search is case-insensitive.
    The snippets are meant to look like in the results of the popular search
    engine: using " ... " between snippets.
    For empty patterns it returns ""

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @param nb_words_around: max number of words around the matched pattern
    @param max_snippets: max number of snippets to include
    @return: snippet
    """
    from invenio.bibdocfile import BibRecDocs, check_bibdoc_authorization

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        # On Inspire, show snippets even for excluded fulltext; otherwise, depend on authorization
        if bd.get_text() and (CFG_INSPIRE_SITE or not check_bibdoc_authorization(user_info, bd.get_status())[0]):
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break # stop at the first good PDF textable file

    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0)
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0)
    if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.has_key(text_path_courtesy):
        nb_chars=CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS[text_path_courtesy]
    if CFG_WEBSEARCH_FULLTEXT_SNIPPETS.has_key(text_path_courtesy):
        max_snippets=CFG_WEBSEARCH_FULLTEXT_SNIPPETS[text_path_courtesy]

    if text_path and nb_chars and max_snippets:
        out = ''
        if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'native':
            out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
            if not out:
                # no hit, so check stemmed versions:
                from invenio.bibindex_engine_stemmer import stem
                stemmed_patterns = [stem(p, 'en') for p in patterns]
                out = get_text_snippets(text_path, stemmed_patterns, nb_chars, max_snippets)
        elif CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'SOLR':
            out = solr_get_snippet(patterns, recID, nb_chars, max_snippets)

        if out:
            out_courtesy = ""
            if CFG_INSPIRE_SITE and text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy, out)
        else:
            return ""
    else:
        return ""
Example #3
def apply_stemming(word, stemming_language):
    """Returns word after applying stemming (if stemming language is set).
       You can change your stemming language in the database.

       @param word: word to be checked
       @type word: str
       @param stemming_language: abbreviation of language or None
       @type stemming_language: str
    """
    if stemming_language:
        word = stem(word, stemming_language)
    return word
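
A minimal usage sketch of apply_stemming; the expected stemmed form is taken from the English stemmer test cases later on this page, and 'en' is the abbreviation for English:

from invenio.bibindex_engine_stemmer import stem

apply_stemming('experiments', 'en')   # -> 'experi' (stemmed with the English stemmer)
apply_stemming('experiments', None)   # -> 'experiments' (no stemming language set)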
Example #5
def get_pdf_snippets(recID,
                     patterns,
                     nb_words_around=CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS,
                     max_snippets=CFG_WEBSEARCH_FULLTEXT_SNIPPETS):
    """
    Extract text snippets around 'patterns' from the newest PDF file of 'recID'.
    The search is case-insensitive.
    The snippets are meant to look like the results of popular search
    engines, with " ... " between snippets.
    For empty patterns it returns "".

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @param nb_words_around: max number of words around the matched pattern
    @param max_snippets: max number of snippets to include
    @return: snippet
    """
    from invenio.bibdocfile import BibRecDocs

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        if bd.get_text():
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break  # stop at the first good PDF textable file

    if text_path:
        out = get_text_snippets(text_path, patterns, nb_words_around,
                                max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns,
                                    nb_words_around, max_snippets, False)
        if out:
            out_courtesy = ""
            if text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy,
                                                               out)
        else:
            return ""
    else:
        return ""
Example #7
def apply_stemming_and_stopwords_and_length_check(word, stemming_language):
    """Return WORD after applying stemming and stopword and length checks.
       See the config file in order to influence these.
    """
    # now check against stopwords:
    if is_stopword(word):
        return ""
    # finally check the word length:
    if len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH:
        return ""
    # stem word, when configured so:
    if stemming_language:
        word = stem(word, stemming_language)
    return word
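
A sketch of the order of checks: the stopword and length checks run on the unstemmed word, and only words that survive both are stemmed. The concrete results below assume 'the' is in the stopword list and that CFG_BIBINDEX_MIN_WORD_LENGTH is greater than 1:

apply_stemming_and_stopwords_and_length_check('the', 'en')          # -> '' (stopword)
apply_stemming_and_stopwords_and_length_check('a', 'en')            # -> '' (below minimum word length)
apply_stemming_and_stopwords_and_length_check('experiments', 'en')  # -> 'experi'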
Example #9
    def answer(self, req, user_info, of, cc, colls_to_search, p, f,
               search_units, ln):
        """
        Answer question given by context, using knowledge base
        returned by L{get_kbname}.

        Return (relevance, html_string) where relevance is an integer
        from 0 to 100 indicating how relevant the answer is to the
        question (see L{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for details),
        and html_string is a formatted answer.
        """
        _ = gettext_set_language(ln)
        # words = [stem(unit[1], ln) for unit in search_units if unit[2] == '']
        words = [
            stem(unit[1].lower(), CFG_SITE_LANG) for unit in search_units
            if unit[2] == ''
        ]
        cache = self.get_data_cache()

        matching_values = {}
        for word in words:
            res = cache.get(word, [])
            for keyword in res:
                if not matching_values.has_key(keyword):
                    matching_values[keyword] = 1
                else:
                    matching_values[keyword] += 1

        # order matching values per score
        matching_values_sorted = sorted(matching_values.iteritems(),
                                        key=lambda (k, v): (v, k),
                                        reverse=True)

        if not matching_values_sorted:
            return (0, '')

        best_score = matching_values_sorted[0][1]

        # Compute relevance. How many words from query did match
        relevance = min(
            100,
            max(0, (100 * float(best_score) /
                    len([word for word in words if len(word) > 3])) - 10))
        labels_and_links = [m.split("|", 1) for m in matching_values.keys()]
        translated_labels_and_links = [(_(label), url)
                                       for label, url in labels_and_links]

        return (relevance,
                self.display_answer_helper(translated_labels_and_links, ln))
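
Note that these examples are Python 2 code: dict.has_key(), dict.iteritems() and tuple-parameter lambdas such as lambda (k, v): (v, k) were all removed in Python 3. A Python 3-compatible form of the score ordering above would be:

# sort (keyword, count) pairs by count, then keyword, highest first:
matching_values_sorted = sorted(matching_values.items(),
                                key=lambda kv: (kv[1], kv[0]),
                                reverse=True)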
Example #10
def clean_and_split_words_and_stem(string, ln=CFG_SITE_LANG, stem_p=True):
    """
    Helper function to split and stem words in a string.

    @param ln: language to consider for stemming
    @param stem_p: if True, also stem each word according to C{ln}
    @return: list of (stemmed) words.
    """
    alphanum_string = re_non_alphanum_only.sub(" ", string).lower()
    words = re_split_words_pattern.split(alphanum_string)
    if stem_p:
        # lowering must be done before stemming; the string was already lowercased above
        words = [stem(word, ln) for word in words]

    return words
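
A usage sketch; the exact token boundaries depend on re_non_alphanum_only and re_split_words_pattern (assumed here to split on non-alphanumeric characters), and the stemmed forms follow the English stemmer test cases on this page:

clean_and_split_words_and_stem("Experiments with information!", ln='en')
# -> ['experi', 'with', 'inform'] (plus possibly empty tokens from the trailing '!')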
Example #12
 def test_stemmer_greek(self):
     """bibindex engine - Greek stemmer"""
     greek_test_cases = [['πληροφορίες', 'ΠΛΗΡΟΦΟΡΙ'],
                         ['πείραμα', 'ΠΕΙΡΑΜ'], ['πειράματα', 'ΠΕΙΡΑΜ'],
                         ['πειραματιστής', 'ΠΕΙΡΑΜΑΤΙΣΤ'],
                         ['πειραματίζομαι', 'ΠΕΙΡΑΜΑΤΙΖ'],
                         ['πειραματίζεσαι', 'ΠΕΙΡΑΜΑΤΙΖ'],
                         ['πειραματίστηκα', 'ΠΕΙΡΑΜΑΤΙΣΤ'],
                         ['πειραματόζωο', 'ΠΕΙΡΑΜΑΤΟΖΩ'], ['ζώο', 'ΖΩ'],
                         ['πειραματισμός', 'ΠΕΙΡΑΜΑΤΙΣΜ'],
                         ['πειραματικός', 'ΠΕΙΡΑΜΑΤΙΚ'],
                         ['πειραματικά', 'ΠΕΙΡΑΜΑΤ'],
                         ['ηλεκτρόνιο', 'ΗΛΕΚΤΡΟΝΙ'],
                         ['ηλεκτρονιακός', 'ΗΛΕΚΤΡΟΝΙΑΚ'],
                         ['ακτίνα', 'ΑΚΤΙΝ'], ['ακτινοβολία', 'ΑΚΤΙΝΟΒΟΛ'],
                         ['E=mc^2', 'E=MC^2'], ['α+β=γ', 'Α+Β=Γ']]
     for test_word, expected_result in greek_test_cases:
         self.assertEqual(expected_result,
                          bibindex_engine_stemmer.stem(test_word, "el"))
 def prepare_data_cache(self):
     """
     "Index" collection names
     """
     from invenio.search_engine import collection_i18nname_cache
     cache = {}
     words_and_coll = [(' '.join([' '.join([stem(word.lower(), ln) for word in \
                                            whitespace_re.split(non_alphanum_chars_only_re.sub(' ', translation))]) for ln, translation in \
                                  translations.iteritems()]), coll_name) for coll_name, translations in \
                       collection_i18nname_cache.cache.iteritems()]
     for words, coll in words_and_coll:
         for word in whitespace_re.split(words):
             if not word.strip():
                 continue
             if not cache.has_key(word):
                 cache[word] = []
             if not coll in cache[word]:
                 cache[word].append(coll)
     return cache
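
The nested list comprehension in prepare_data_cache above is hard to follow; an equivalent sketch with explicit loops (same helpers, same resulting cache):

def prepare_data_cache(self):
    """'Index' collection names: map each stemmed word of every translated
    collection name to the list of collections containing it."""
    from invenio.search_engine import collection_i18nname_cache
    cache = {}
    for coll_name, translations in collection_i18nname_cache.cache.iteritems():
        for ln, translation in translations.iteritems():
            # strip non-alphanumeric characters, then split and stem each word:
            cleaned = non_alphanum_chars_only_re.sub(' ', translation)
            for word in whitespace_re.split(cleaned):
                word = stem(word.lower(), ln)
                if not word.strip():
                    continue
                cache.setdefault(word, [])
                if coll_name not in cache[word]:
                    cache[word].append(coll_name)
    return cache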
Example #15
def get_pdf_snippets(recID,
                     patterns,
                     nb_words_around=CFG_WEBSEARCH_FULLTEXT_SNIPPETS_WORDS,
                     max_snippets=CFG_WEBSEARCH_FULLTEXT_SNIPPETS):
    """
    Extract text snippets around 'patterns' from the newest PDF file of 'recID'.
    The search is case-insensitive.
    The snippets are meant to look like the results of popular search
    engines, with " ... " between snippets.
    For empty patterns it returns "".
    """
    from invenio.bibdocfile import BibRecDocs

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        if bd.get_text():
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if not text_path_courtesy and CFG_INSPIRE_SITE:
                text_path_courtesy = bd.get_type()
            break  # stop at the first good PDF textable file

    if text_path:
        out = get_text_snippets(text_path, patterns, nb_words_around,
                                max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns,
                                    nb_words_around, max_snippets, False)
        if out:
            out_courtesy = ""
            if text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
            return """<div class="snippetbox">%s%s</div>""" % (out_courtesy,
                                                               out)
        else:
            return ""
    else:
        return ""
 def test_stemmer_english(self):
     """bibindex engine - English stemmer"""
     english_test_cases = [['information', 'inform'],
                           ['experiment', 'experi'],
                           ['experiments', 'experi'],
                           ['experimented', 'experi'],
                           ['experimenting', 'experi'],
                           ['experimental', 'experiment'],
                           ['experimentally', 'experiment'],
                           ['experimentation', 'experiment'],
                           ['experimentalism', 'experiment'],
                           ['experimenter', 'experiment'],
                           ['experimentalise', 'experimentalis'],
                           ['experimentalist', 'experimentalist'],
                           ['experimentalists', 'experimentalist'],
                           ['GeV', 'GeV'], ['$\Omega$', '$\Omega$'],
                           ['e^-', 'e^-'], ['C#', 'C#'], ['C++', 'C++']]
     for test_word, expected_result in english_test_cases:
         self.assertEqual(expected_result,
                          bibindex_engine_stemmer.stem(test_word, "en"))
Example #21
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code
        )

    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if (
            not methods[rank_method_code]["stopwords"] == "True"
            or methods[rank_method_code]["stopwords"]
            and not is_stopword(term, 1)
        ):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(
                string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], " ", term))
            )
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"):  # stem word
                    term = stem(string.replace(term, " ", ""), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,)
        )
        if term_recs:  # if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None
            )
            del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "",
            voutput,
        )
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += (
            "Number of terms: %s<br />"
            % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        )
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
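
A note on the query construction above: the table name is interpolated with the % operator (it comes from the rnkMETHOD configuration, not from user input), while the doubled %%s survives that interpolation as %s, so run_sql can bind the term as a real query parameter:

# after the % interpolation the statement reads, e.g.:
#   SELECT term, hitlist FROM rnkWORD01F WHERE term=%s
# and run_sql then binds (term,) to the remaining %s placeholder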
 def test_stemmer_none(self):
     """bibindex engine - no stemmer"""
     self.assertEqual("information",
                      bibindex_engine_stemmer.stem("information", None))
    def answer(self, req, user_info, of, cc, colls_to_search, p, f,
               search_units, ln):
        """
        Answer question given by context.

        Return (relevance, html_string) where relevance is an integer
        from 0 to 100 indicating how relevant the answer is to the
        question (see C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for details),
        and html_string is a formatted answer.
        """
        from invenio.search_engine import \
             get_permitted_restricted_collections, \
             get_coll_i18nname, \
             collection_i18nname_cache, \
             collection_restricted_p
        _ = gettext_set_language(ln)
        # stem search units. remove those with field
        # TODO: search in hosted collection names too
        # TODO: ignore unattached trees
        # TODO: use synonyms
        if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
               (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
            return (0, '')

        words = [
            stem(unit[1], ln) for unit in search_units
            if unit[2] in ('', 'collection')
        ]  # Stemming

        if not words:
            return (0, '')

        permitted_restricted_collections = get_permitted_restricted_collections(
            user_info)
        cache = self.get_data_cache()

        matching_collections = {}
        for word in words:
            if CFG_CERN_SITE and word == 'cern':
                # This keyword is useless here...
                continue

            colls = cache.get(word.lower(), [])
            for coll in colls:
                if collection_restricted_p(coll) and \
                       not coll in permitted_restricted_collections:
                    # Skip restricted collections the user does not have access to
                    continue
                if not matching_collections.has_key(coll):
                    matching_collections[coll] = 0
                matching_collections[coll] += 1

        matching_collections_sorted = sorted(matching_collections.iteritems(),
                                             key=lambda (k, v): (v, k),
                                             reverse=True)
        if not matching_collections_sorted:
            return (0, '')

        matching_collections_names = [(get_coll_i18nname(coll, ln, False), CFG_SITE_URL + '/collection/' + urllib.quote(coll, safe='') + '?ln=en') \
                                      for coll, score in matching_collections_sorted]

        best_score = matching_collections_sorted[0][1]
        best_coll_words = whitespace_re.split(
            matching_collections_sorted[0][0])

        relevance = min(
            100,
            max(0, (100 * float(2 * best_score) /
                    float(len(best_coll_words) + len(words)) - 10)))

        if (('submit' in p.lower()) or (_('submit') in p.lower())) and \
               not (('submit' in best_coll_words) or (_('submit') in best_coll_words)):
            # User is probably looking for a submission. Decrease relevance
            relevance = max(0, relevance - 30)

        return (relevance,
                self.display_answer_helper(matching_collections_names, ln))
    def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
        """
        Answer question given by context.

        Return (relevance, html_string) where relevance is an integer
        from 0 to 100 indicating how relevant the answer is to the
        question (see C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for details),
        and html_string is a formatted answer.
        """
        _ = gettext_set_language(ln)
        if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
               (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
            return (0, '')

        words = [stem(unit[1].lower(), CFG_SITE_LANG) for unit in search_units if unit[2] == '']

        if not words:
            return (0, '')

        cache = self.get_data_cache()

        # TODO: If all categories of a submission match, display only submission (not categories)

        matching_submissions = {}

        for word in words:
            # Look for submission names
            if CFG_CERN_SITE and word == 'cern':
                # This keyword is useless here...
                continue

            submissions = cache.get(word, [])
            for doctype, submission_label, category in submissions:
                if acc_authorize_action(req, 'submit', \
                                        authorized_if_no_roles=not isGuestUser(user_info['uid']), \
                                        doctype=(CFG_CERN_SITE and doctype.startswith('GENSBM#') and 'GENSBM') or doctype,
                                        categ=category)[0] != 0:
                    # Not authorized to submit in this submission
                    continue

                if not matching_submissions.has_key((doctype, submission_label)):
                    matching_submissions[(doctype, submission_label)] = 0
                add_score = 1
                if category != '*':
                    # This is the submission category, consider that
                    # words that are part of the submission name are
                    # less important than others here:
                    if not word.lower() in category.lower():
                        # word is only in submission name
                        add_score = 0.5
                    else:
                        add_score = 1.5

                matching_submissions[(doctype, submission_label)] += add_score

        matching_submissions_sorted = sorted(matching_submissions.iteritems(), key=lambda (k, v): (v, k), reverse=True)
        if not matching_submissions_sorted:
            return (0, '')
        best_score = matching_submissions_sorted[0][1]
        max_score_difference = 1.9

        matching_submissions_names = [(submission_label, \
                                       CFG_SITE_URL + '/submit?doctype=' + doctype.split("#", 1)[0] + '&ln=' + ln + (CFG_CERN_SITE and doctype.startswith('GENSBM#') and '#' + doctype.split("#", 1)[-1] or '') ) \
                                      for (doctype, submission_label), score in matching_submissions_sorted if score > best_score - max_score_difference]

        best_sbm_words = whitespace_re.split(matching_submissions_sorted[0][0][1])

        score_bonus = (((_("Submit").lower() in words) or ("submit" in words)) or \
                       ((_("Revise").lower() in words) or ("revise" in words)) or \
                       ((_("Modify").lower() in words) or ("modify" in words))) and 40 or 0
        relevance = min(100, max(0,  (score_bonus + (100 * float(best_score)  /  float(len(best_sbm_words) + len(words)))) - 10))

        return (relevance, self.display_answer_helper(matching_submissions_names, ln))
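
The cond and 40 or 0 construction above is the pre-Python-2.5 emulation of a conditional expression; a modern equivalent of the score_bonus computation:

# 40-point bonus when the query mentions a submission action:
wants_action = (_("Submit").lower() in words or "submit" in words or
                _("Revise").lower() in words or "revise" in words or
                _("Modify").lower() in words or "modify" in words)
score_bonus = 40 if wants_action else 0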