def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Forward all tokenizer configuration to the default base tokenizer."""
    BibIndexDefaultTokenizer.__init__(self,
                                      stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialize the tokenizer and pre-compile author-name regexes.

    All four parameters are passed through unchanged to
    BibIndexDefaultTokenizer.__init__.
    """
    BibIndexDefaultTokenizer.__init__(self, stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
    # Raw strings: '\w', '\.' and '\s' are not valid Python string escapes,
    # so the former plain strings triggered invalid-escape warnings.
    # matches a single initial such as "J."
    self.single_initial_re = re.compile(r'^\w\.$')
    # split points inside a name: dot, whitespace, or hyphen
    self.split_on_re = re.compile(r'[\.\s-]')
    # lastname_stopwords describes terms which should not be used for indexing,
    # in multiple-word last names. These are purely conjunctions, serving the
    # same function as the American hyphen, but using linguistic constructs.
    self.lastname_stopwords = set(['y', 'of', 'and', 'de'])
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialize the tokenizer and pre-compile author-name regexes.

    All four parameters are passed through unchanged to
    BibIndexDefaultTokenizer.__init__.
    """
    BibIndexDefaultTokenizer.__init__(self, stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
    # Raw strings: '\w', '\.' and '\s' are not valid Python string escapes,
    # so the former plain strings triggered invalid-escape warnings.
    # matches a single initial such as "J."
    self.single_initial_re = re.compile(r'^\w\.$')
    # split points inside a name: dot, whitespace, or hyphen
    self.split_on_re = re.compile(r'[\.\s-]')
    # lastname_stopwords describes terms which should not be used for indexing,
    # in multiple-word last names. These are purely conjunctions, serving the
    # same function as the American hyphen, but using linguistic constructs.
    self.lastname_stopwords = set(['y', 'of', 'and', 'de'])
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Set this tokenizer's verbosity, then delegate to the base class."""
    # verbosity level used by this tokenizer
    self.verbose = 3
    BibIndexDefaultTokenizer.__init__(
        self, stemming_language, remove_stopwords,
        remove_html_markup, remove_latex_markup)
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern (may contain '%'/'*' wildcards or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching type, forwarded to phrase search on fallback
    :param wl: wildcard limit; 0 disables the limit
    :raises InvenioWebSearchWildcardLimitError: when the query limit is hit
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: start as None so the first hitset seeds the intersection;
    # starting from an empty intbitset() made every intersection empty.
    # The final `return result_set or intbitset()` covers the None case.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')

    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        # hopefully this will not clash with anything in the future
        replacement = 'xxxxxxxxxx'
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern (may contain '%'/'*' wildcards or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching type, forwarded to phrase search on fallback
    :param wl: wildcard limit; 0 disables the limit
    :raises InvenioWebSearchWildcardLimitError: when the query limit is hit
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: start as None so the first hitset seeds the intersection;
    # starting from an empty intbitset() made every intersection empty.
    # The final `return result_set or intbitset()` covers the None case.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace("*", "%")

    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(" ") or ps[1].startswith(" ")):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif (len(pairs_left) > 1 and len(pairs_right) > 1
                and pairs_left[:-1] != pairs_right[:-1]):
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find("%") > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        # hopefully this will not clash with anything in the future
        replacement = "xxxxxxxxxx"
        p = p.replace("%", replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, "%")
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg["CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH"] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(
                model.id_bibrec == recid).value(model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialisation"""
    # no tokenizer-specific state: hand everything to the base class
    BibIndexDefaultTokenizer.__init__(self,
                                      stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)