def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Forward all tokenizer configuration to the default base tokenizer."""
    BibIndexDefaultTokenizer.__init__(self,
                                      stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialize the tokenizer and pre-compile author-name regexes.

    All four parameters are passed through unchanged to
    BibIndexDefaultTokenizer.__init__.
    """
    BibIndexDefaultTokenizer.__init__(self, stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
    # Raw strings: '\w', '\.' and '\s' are not valid Python string escapes,
    # so the former plain strings triggered invalid-escape warnings.
    # matches a single initial such as "J."
    self.single_initial_re = re.compile(r'^\w\.$')
    # split points inside a name: dot, whitespace, or hyphen
    self.split_on_re = re.compile(r'[\.\s-]')
    # lastname_stopwords describes terms which should not be used for indexing,
    # in multiple-word last names. These are purely conjunctions, serving the
    # same function as the American hyphen, but using linguistic constructs.
    self.lastname_stopwords = set(['y', 'of', 'and', 'de'])
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialize the tokenizer and pre-compile author-name regexes.

    All four parameters are passed through unchanged to
    BibIndexDefaultTokenizer.__init__.
    """
    BibIndexDefaultTokenizer.__init__(self, stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)
    # Raw strings: '\w', '\.' and '\s' are not valid Python string escapes,
    # so the former plain strings triggered invalid-escape warnings.
    # matches a single initial such as "J."
    self.single_initial_re = re.compile(r'^\w\.$')
    # split points inside a name: dot, whitespace, or hyphen
    self.split_on_re = re.compile(r'[\.\s-]')
    # lastname_stopwords describes terms which should not be used for indexing,
    # in multiple-word last names. These are purely conjunctions, serving the
    # same function as the American hyphen, but using linguistic constructs.
    self.lastname_stopwords = set(['y', 'of', 'and', 'de'])
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Set this tokenizer's verbosity, then delegate to the base class."""
    # verbosity level used by this tokenizer
    self.verbose = 3
    BibIndexDefaultTokenizer.__init__(
        self, stemming_language, remove_stopwords,
        remove_html_markup, remove_latex_markup)
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern (may contain '%'/'*' wildcards or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching type, forwarded to phrase search on fallback
    :param wl: wildcard limit; 0 disables the limit
    :raises InvenioWebSearchWildcardLimitError: when the query limit is hit
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: start as None so the first hitset seeds the intersection;
    # starting from an empty intbitset() made every intersection empty.
    # The final `return result_set or intbitset()` covers the None case.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace('*', '%')

    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(' ') or ps[1].startswith(' ')):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif len(pairs_left) > 1 and \
                len(pairs_right) > 1 and \
                pairs_left[:-1] != pairs_right[:-1]:
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find('%') > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        # hopefully this will not clash with anything in the future
        replacement = 'xxxxxxxxxx'
        p = p.replace('%', replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, '%')
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg['CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH'] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(model.id_bibrec == recid).value(
                model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
def search_unit_in_idxpairs(p, f, m, wl=0):
    """Search for pair 'p' in idxPAIR table for field 'f' and return hitset.

    :param p: search pattern (may contain '%'/'*' wildcards or a '->' span)
    :param f: logical field whose idxPAIR index is queried
    :param m: matching type, forwarded to phrase search on fallback
    :param wl: wildcard limit; 0 disables the limit
    :raises InvenioWebSearchWildcardLimitError: when the query limit is hit
    """
    from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import (
        BibIndexDefaultTokenizer)
    # flag for knowing if the query limit has been reached
    limit_reached = False
    # flag to know when it makes sense to try to do exact matching
    do_exact_search = True
    # BUGFIX: start as None so the first hitset seeds the intersection;
    # starting from an empty intbitset() made every intersection empty.
    # The final `return result_set or intbitset()` covers the None case.
    result_set = None
    # determine the idxPAIR table to read from
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return intbitset()
    model = index.pairf
    column = model.term
    stemming_language = index.stemming_language
    pairs_tokenizer = BibIndexDefaultTokenizer(stemming_language)

    conditions = []

    if p.startswith("%") and p.endswith("%"):
        p = p[1:-1]
    original_pattern = p
    # we now use '*' as the truncation character
    p = p.replace("*", "%")

    # is it a span query?
    ps = p.split("->", 1)
    if len(ps) == 2 and not (ps[0].endswith(" ") or ps[1].startswith(" ")):
        # so we are dealing with a span query
        pairs_left = pairs_tokenizer.tokenize_for_pairs(ps[0])
        pairs_right = pairs_tokenizer.tokenize_for_pairs(ps[1])
        if not pairs_left or not pairs_right:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        elif len(pairs_left) != len(pairs_right):
            # it is kind of hard to know what the user actually wanted
            # we have to do: foo bar baz -> qux xyz, so let's switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        elif (len(pairs_left) > 1 and len(pairs_right) > 1
                and pairs_left[:-1] != pairs_right[:-1]):
            # again we have something like: foo bar baz -> abc xyz qux
            # so we'd better switch to phrase
            return search_unit_in_idxphrases(original_pattern, f, m, wl)
        else:
            # finally, we can treat the search using idxPairs
            # at this step we have either: foo bar -> abc xyz
            # or foo bar abc -> foo bar xyz
            conditions.append(
                (column.between(pairs_left[-1], pairs_right[-1]), True))
            # which should be equal with pairs_right[:-1]
            for pair in pairs_left[:-1]:
                conditions.append((column == pair, False))
        do_exact_search = False  # no exact search for span queries
    elif p.find("%") > -1:
        # tokenizing p will remove the '%', so we have to make sure it stays
        # hopefully this will not clash with anything in the future
        replacement = "xxxxxxxxxx"
        p = p.replace("%", replacement)
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            if pair.find(replacement) > -1:
                # we replace back the % sign
                pair = pair.replace(replacement, "%")
                conditions.append((column.like(pair), True))
            else:
                conditions.append((column == pair, False))
        do_exact_search = False
    else:
        # normal query
        pairs = pairs_tokenizer.tokenize_for_pairs(p)
        if not pairs:
            # we are not actually dealing with pairs but with words
            return search_unit_in_bibwords(original_pattern, f, wl=wl)
        for pair in pairs:
            conditions.append((column == pair, False))

    for condition, use_query_limit in conditions:
        query = model.query.filter(condition)
        if use_query_limit and wl > 0:
            query = query.limit(wl)
        res = query.values(model.term, model.hitlist)
        limit_reached |= use_query_limit and wl > 0 and len(res) == wl
        if not res:
            return intbitset()
        for pair, hitlist in res:
            hitset_idxpairs = intbitset(hitlist)
            if result_set is None:
                result_set = hitset_idxpairs
            else:
                result_set.intersection_update(hitset_idxpairs)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(result_set)

    # check if we need to eliminate the false positives
    if cfg["CFG_WEBSEARCH_IDXPAIRS_EXACT_SEARCH"] and do_exact_search:
        # we need to eliminate the false positives
        model = IdxINDEX.idxPHRASER(f)
        not_exact_search = intbitset()
        for recid in result_set:
            res = model.query.filter(
                model.id_bibrec == recid).value(model.termlist)
            if res:
                termlist = deserialize_via_marshal(res)
                if not [term for term in termlist
                        if term.lower().find(p.lower()) > -1]:
                    not_exact_search.add(recid)
            else:
                not_exact_search.add(recid)
        # remove the recs that are false positives from the final result
        result_set.difference_update(not_exact_search)
    return result_set or intbitset()
def __init__(self, stemming_language=None, remove_stopwords=False,
             remove_html_markup=False, remove_latex_markup=False):
    """Initialisation"""
    # no tokenizer-specific state: hand everything to the base class
    BibIndexDefaultTokenizer.__init__(self,
                                      stemming_language,
                                      remove_stopwords,
                                      remove_html_markup,
                                      remove_latex_markup)