def _index_id(self):
    """Return the id of the index registered for the 'collection' field."""
    collection_index = IdxINDEX.get_from_field('collection')
    return collection_index.id
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern (may contain '*' truncation or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching mode, forwarded to the phrase-search fallback
    :param wl: wildcard limit; 0 means unlimited
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial result set is attached to the exception).
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUG FIX: must start as None, not intbitset().  The accumulation loop
    # below only assigns the first hitset when result_set is None; starting
    # from an empty intbitset meant every hitset was intersected with the
    # empty set, so the function always returned an empty result.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz,
            # so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
            do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefuly this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # result_set is still None when no condition was evaluated
    return result_set or intbitset()
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word or pattern ('*' truncation, 'a->b' span) to search
    :param f: logical field to search in; falls back to 'anyfield'
    :param decompress: unused here; kept for signature compatibility
    :param wl: wildcard limit; 0 means unlimited
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial hitset is attached to the exception).
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached
    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language
    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)
        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                # BUG FIX: Query.limit() is generative and returns a new
                # query; the result was previously discarded, so the
                # wildcard limit 'wl' was never actually applied here.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(
                wash_index_term(word))).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' inside idxPAIR table for field 'f' and return
    hitset.

    :param p: search pattern (may contain '*' truncation or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching mode, forwarded to the phrase-search fallback
    :param wl: wildcard limit; 0 means unlimited
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial result set is attached to the exception).
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer
    )
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUG FIX: must start as None, not intbitset().  The accumulation loop
    # below only assigns the first hitset when result_set is None; starting
    # from an empty intbitset meant every hitset was intersected with the
    # empty set, so the function always returned an empty result.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz,
            # so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True)
            )
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
            do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefuly this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # result_set is still None when no condition was evaluated
    return result_set or intbitset()
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word or pattern ('*' truncation, 'a->b' span) to search
    :param f: logical field to search in; falls back to 'anyfield'
    :param decompress: unused here; kept for signature compatibility
    :param wl: wildcard limit; 0 means unlimited
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial hitset is attached to the exception).
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached
    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language
    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)
        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed)
        )
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                # BUG FIX: Query.limit() is generative and returns a new
                # query; the result was previously discarded, so the
                # wildcard limit 'wl' was never actually applied here.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(
                model.term.like(wash_index_term(word))
            ).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset