def _index_id(self):
    """Return the id of the index registered for the 'collection' field."""
    collection_index = IdxINDEX.get_from_field('collection')
    return collection_index.id
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern (may contain '*' truncation or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching mode, forwarded to the phrase-search fallback
    :param wl: wildcard limit; 0 means unlimited
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial result set is attached to the exception).
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUG FIX: must start as None, not intbitset().  The accumulation loop
    # below only assigns the first hitset when result_set is None; starting
    # from an empty intbitset meant every hitset was intersected with the
    # empty set, so the function always returned an empty result.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz,
            # so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
            do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefuly this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # result_set is still None when no condition was evaluated
    return result_set or intbitset()
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word or pattern ('*' truncation, 'a->b' span) to search
    :param f: logical field to search in; falls back to 'anyfield'
    :param decompress: unused here; kept for signature compatibility
    :param wl: wildcard limit; 0 means unlimited
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial hitset is attached to the exception).
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached
    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language
    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)
        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                # BUG FIX: Query.limit() is generative and returns a new
                # query; the result was previously discarded, so the
                # wildcard limit 'wl' was never actually applied here.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(
                wash_index_term(word))).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' inside idxPAIR table for field 'f' and return
    hitset.

    :param p: search pattern (may contain '*' truncation or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching mode, forwarded to the phrase-search fallback
    :param wl: wildcard limit; 0 means unlimited
    :return: intbitset of matching record IDs
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial result set is attached to the exception).
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer
    )
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUG FIX: must start as None, not intbitset().  The accumulation loop
    # below only assigns the first hitset when result_set is None; starting
    # from an empty intbitset meant every hitset was intersected with the
    # empty set, so the function always returned an empty result.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)
    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')
    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz,
            # so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True)
            )
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
            do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        replacement = 'xxxxxxxxxx'
        # hopefuly this will not clash with anything in the future
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    # result_set is still None when no condition was evaluated
    return result_set or intbitset()
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: word or pattern ('*' truncation, 'a->b' span) to search
    :param f: logical field to search in; falls back to 'anyfield'
    :param decompress: unused here; kept for signature compatibility
    :param wl: wildcard limit; 0 means unlimited
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: when 'wl' was reached
        (the partial hitset is attached to the exception).
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached
    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language
    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)
        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed)
        )
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                # BUG FIX: Query.limit() is generative and returns a new
                # query; the result was previously discarded, so the
                # wildcard limit 'wl' was never actually applied here.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(
                model.term.like(wash_index_term(word))
            ).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset