Example #1
    def _sort_alphanumerically_remove_leading_articles_strip_accents(self, val):
        """
        Convert:
        'The title' => 'title'
        'A title' => 'title'
        'Title' => 'title'
        """
        if not val:
            return ''
        val_tokens = str(val).split(" ", 1)  # split into leading_word, phrase_without_leading_word
        if len(val_tokens) == 2 and val_tokens[0].lower() in LEADING_ARTICLES:
            return strip_accents(val_tokens[1].strip().lower())
        return strip_accents(val.lower())
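
A standalone sketch of the same normalization, assuming strip_accents is the usual unicodedata-based accent folding and LEADING_ARTICLES is a set such as {'the', 'a', 'an'} (neither is shown in this example):

import unicodedata

LEADING_ARTICLES = {'the', 'a', 'an'}  # assumed; the real list is defined elsewhere

def strip_accents(text):
    # assumed implementation: decompose, then drop the combining marks
    nfkd = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in nfkd if not unicodedata.combining(ch))

def sort_key(val):
    if not val:
        return ''
    tokens = str(val).split(' ', 1)
    if len(tokens) == 2 and tokens[0].lower() in LEADING_ARTICLES:
        return strip_accents(tokens[1].strip().lower())
    return strip_accents(val.lower())

print(sort_key('The Éducation sentimentale'))  # -> 'education sentimentale'
print(sort_key('A title'))                     # -> 'title'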
Example #2
    def tokenize_for_words(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block,
                                               self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(
                        subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock,
                                                      self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(
                            alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(
                            stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(
                            stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
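
The helpers and regular expressions above (remove_stopwords, apply_stemming, re_punctuation, re_separators, ...) come from the indexer's configuration and are not shown here. A rough standalone sketch of the same multi-level split, with simplified stand-ins for those patterns and without accent stripping, stopword removal, or stemming:

import re

# simplified stand-ins for the configurable patterns used by the tokenizer
re_block_punct_begin = re.compile(r'^[.,:;?!"]+')
re_block_punct_end = re.compile(r'[.,:;?!"]+$')
re_punct = re.compile(r'[.,:;?!"]')
re_seps = re.compile(r'[\s\-/]')

def tokenize_for_words(phrase):
    words = {}
    # 1st: split on whitespace, 2nd: strip leading/trailing punctuation
    for block in phrase.lower().split():
        block = re_block_punct_end.sub('', re_block_punct_begin.sub('', block))
        if not block:
            continue
        words[block] = 1
        # 3rd: split on punctuation, 4th: split on separators
        for subblock in re_punct.split(block):
            if subblock:
                words[subblock] = 1
            for group in re_seps.split(subblock):
                if group:
                    words[group] = 1
    return list(words.keys())

print(tokenize_for_words('High-energy physics, 2010.'))
# -> ['high-energy', 'high', 'energy', 'physics', '2010']  (order follows insertion)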
Example #4
    def tokenize_for_pairs(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock,
                                                self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(
                                subblock):
                            alphanumeric_group = remove_stopwords(
                                alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(
                                alphanumeric_group)
                            alphanumeric_group = apply_stemming(
                                alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' %
                                          (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
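
The pairing logic is the part that differs from tokenize_for_words: each kept alphanumeric group is joined with the previously kept one into a 'word1 word2' key. A minimal sketch of just that step, assuming the words have already been normalized:

def pairs_of(words):
    # words: an already-normalized sequence of single words
    pairs = {}
    last_word = ''
    for word in words:
        if word:
            if last_word:
                pairs['%s %s' % (last_word, word)] = 1
            last_word = word
    return list(pairs.keys())

print(pairs_of(['higgs', 'boson', 'search']))
# -> ['higgs boson', 'boson search']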
Example #6
def create_nearest_terms_box(urlargd,
                             p,
                             f,
                             t='w',
                             n=5,
                             ln=None,
                             intro_text_p=True):
    """Return text box containing list of 'n' nearest terms above/below 'p'.

    Look in the field 'f' for matching type 't' (words/phrases) in language
    'ln'.  Propose new searches according to `urlargd' with the new words.  If
    `intro_text_p' is true, then display the introductory message, otherwise
    print only the nearest terms in the box content.
    """
    # load the right message language
    _ = gettext_set_language(ln or g.ln)

    if not CFG_WEBSEARCH_DISPLAY_NEAREST_TERMS:
        return _("Your search did not match any records.  Please try again.")
    nearest_terms = []
    if not p:  # sanity check
        p = "."
    if p.startswith('%') and p.endswith('%'):
        p = p[1:-1]  # fix for partial phrase
    index_id = get_index_id_from_field(f)
    if f == 'fulltext':
        if CFG_SOLR_URL:
            return _("No match found, please enter different search terms.")
        else:
            # FIXME: workaround for not having native phrase index yet
            t = 'w'
    # special indexes:
    if f == 'refersto' or f == 'referstoexcludingselfcites':
        return _("There are no records referring to %(x_rec)s.",
                 x_rec=cgi.escape(p))
    if f == 'cataloguer':
        return _("There are no records modified by %(x_rec)s.",
                 x_rec=cgi.escape(p))
    if f == 'citedby' or f == 'citedbyexcludingselfcites':
        return _("There are no records cited by %(x_rec)s.",
                 x_rec=cgi.escape(p))
    # look for nearest terms:
    if t == 'w':
        nearest_terms = get_nearest_terms_in_bibwords(p, f, n, n)
        if not nearest_terms:
            return _(
                "No word index is available for %(x_name)s.",
                x_name=('<em>' + cgi.escape(
                    get_field_i18nname(get_field_name(f) or f, ln, False)) +
                        '</em>'))
    else:
        nearest_terms = []
        if index_id:
            nearest_terms = get_nearest_terms_in_idxphrase(p, index_id, n, n)
        if f == 'datecreated' or f == 'datemodified':
            nearest_terms = get_nearest_terms_in_bibrec(p, f, n, n)
        if not nearest_terms:
            nearest_terms = get_nearest_terms_in_bibxxx(p, f, n, n)
        if not nearest_terms:
            return _(
                "No phrase index is available for %(x_name)s.",
                x_name=('<em>' + cgi.escape(
                    get_field_i18nname(get_field_name(f) or f, ln, False)) +
                        '</em>'))

    terminfo = []
    for term in nearest_terms:
        if t == 'w':
            hits = get_nbhits_in_bibwords(term, f)
        else:
            if index_id:
                hits = get_nbhits_in_idxphrases(term, f)
            elif f == 'datecreated' or f == 'datemodified':
                hits = get_nbhits_in_bibrec(term, f)
            else:
                hits = get_nbhits_in_bibxxx(term, f)

        argd = {}
        argd.update(urlargd)

        # check which fields contained the requested parameter, and replace it.
        for px, dummy_fx in ('p', 'f'), ('p1', 'f1'), ('p2', 'f2'), ('p3',
                                                                     'f3'):
            if px in argd:
                argd_px = argd[px]
                if t == 'w':
                    # p was stripped of accents, so do the same:
                    argd_px = strip_accents(argd_px)
                #argd[px] = string.replace(argd_px, p, term, 1)
                #we need something similar, but case insensitive
                pattern_index = string.find(argd_px.lower(), p.lower())
                if pattern_index > -1:
                    argd[px] = argd_px[:pattern_index] + term + argd_px[
                        pattern_index + len(p):]
                    break
                #this is doing exactly the same as:
                #argd[px] = re.sub('(?i)' + re.escape(p), term, argd_px, 1)
                #but is ~4x faster (2us vs. 8.25us)
        terminfo.append((term, hits, argd))

    intro = ""
    if intro_text_p:  # add full leading introductory text
        if f:
            intro = _("Search term %(x_term)s inside index %(x_index)s did not match any record. Nearest terms in any collection are:") % \
                     {'x_term': "<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>",
                      'x_index': "<em>" + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + "</em>"}
        else:
            intro = _(
                "Search term %(x_name)s did not match any record. Nearest terms in any collection are:",
                x_name=("<em>" + cgi.escape(
                    p.startswith("%") and p.endswith("%") and p[1:-1] or p) +
                        "</em>"))

    return websearch_templates.tmpl_nearest_term_box(p=p,
                                                     ln=ln,
                                                     f=f,
                                                     terminfo=terminfo,
                                                     intro=intro)
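
Inside the loop above, the query pattern p is replaced by the suggested term using a manual lower-cased find() rather than a case-insensitive re.sub, which the original comment claims is roughly four times faster. A standalone sketch of the two equivalent approaches:

import re

def replace_first_ci_find(text, old, new):
    # manual case-insensitive replacement of the first occurrence
    idx = text.lower().find(old.lower())
    if idx > -1:
        return text[:idx] + new + text[idx + len(old):]
    return text

def replace_first_ci_re(text, old, new):
    # regex-based equivalent
    return re.sub('(?i)' + re.escape(old), new, text, count=1)

print(replace_first_ci_find('author:ELLIS', 'ellis', 'ellison'))  # -> 'author:ellison'
print(replace_first_ci_re('author:ELLIS', 'ellis', 'ellison'))    # -> 'author:ellison'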
Example #7
def get_nearest_terms_in_bibxxx(p, f, n_below, n_above):
    """Browse (-n_above, +n_below) closest bibliographic phrases
       for the given pattern p in the given field f, regardless
       of collection.
       Return list of [phrase1, phrase2, ... , phrase_n]."""
    # determine browse field:
    if not f and string.find(p, ":") > 0:  # does 'p' contain ':'?
        f, p = string.split(p, ":", 1)

    # FIXME: quick hack for the journal index
    if f == 'journal':
        return get_nearest_terms_in_bibwords(p, f, n_below, n_above)

    # We are going to take max(n_below, n_above) as the number of
    # values to fetch from bibXXx.  This is needed to work around
    # MySQL UTF-8 sorting troubles in 4.0.x.  Proper solution is to
    # use MySQL 4.1.x or our own idxPHRASE in the future.

    index_id = get_index_id_from_field(f)
    if index_id:
        return get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above)

    n_fetch = 2 * max(n_below, n_above)
    # construct 'tl' which defines the tag list (MARC tags) to search in:
    tl = []
    if str(f[0]).isdigit() and str(f[1]).isdigit():
        tl.append(f)  # 'f' seems to be okay as it starts by two digits
    else:
        # deduce desired MARC tags on the basis of chosen 'f'
        tl = get_field_tags(f)
    # start browsing to fetch list of hits:
    browsed_phrases = {}
    # will hold {phrase1: 1, phrase2: 1, ..., phraseN: 1} dict of browsed
    # phrases (to make them unique)

    # always add self to the results set:
    browsed_phrases[p.startswith("%") and p.endswith("%") and p[1:-1] or p] = 1
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        model = getattr(models, 'Bib{0}{1}x'.format(digit1, digit2))

        if len(t) != 6 or t[-1:] == '%':
            # only the beginning of field 't' is defined, so add wildcard
            # character:
            condition = model.tag.like(t + '%')
        else:
            condition = model.tag == t

        res = set([
            item[0] for item in model.query.filter(model.value < p, condition).
            order_by(model.value.desc()).limit(n_fetch).values(model.value)
        ])
        res |= set([
            item[0] for item in model.query.filter(model.value > p, condition).
            order_by(model.value.asc()).limit(n_fetch).values(model.value)
        ])

    # select first n words only: (this is needed as we were searching
    # in many different tables and so aren't sure we have more than n
    # words right; this of course won't be needed when we shall have
    # one ACC table only for given field):
    phrases_out = list(res)
    phrases_out.sort(lambda x, y: cmp(string.lower(strip_accents(x)),
                                      string.lower(strip_accents(y))))
    # find position of self:
    try:
        idx_p = phrases_out.index(p)
    except ValueError:
        idx_p = len(phrases_out) / 2
    # return n_above and n_below:
    return phrases_out[max(0, idx_p - n_above):idx_p + n_below]
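
The final step of the browse (sort the collected phrases, locate the pattern itself, then return n_above neighbours before it plus n_below after it) can be sketched on its own, with accent stripping omitted for brevity:

def nearest_window(phrases, p, n_below, n_above):
    phrases = sorted(phrases, key=str.lower)
    try:
        idx_p = phrases.index(p)
    except ValueError:
        idx_p = len(phrases) // 2
    return phrases[max(0, idx_p - n_above):idx_p + n_below]

terms = ['algebra', 'boson', 'higgs', 'muon', 'neutrino', 'quark']
print(nearest_window(terms, 'higgs', n_below=2, n_above=1))
# -> ['boson', 'higgs', 'muon']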
Example #8
def sort_records_bibxxx(recIDs,
                        tags,
                        sort_field='',
                        sort_order='d',
                        sort_pattern='',
                        rg=None,
                        jrec=None):
    """Sort record list according sort field in given order.

    If more than one instance of 'sort_field' is found for a given record, try
    to choose the one that is given by 'sort_pattern', for example "sort by report
    number that starts by CERN-PS".  Note that 'sort_field' can be field code
    like 'author' or MARC tag like '100__a' directly.
    """
    # check arguments:
    if not sort_field:
        return slice_records(recIDs, jrec, rg)

    if len(recIDs) > cfg['CFG_WEBSEARCH_NB_RECORDS_TO_SORT']:
        return slice_records(recIDs, jrec, rg)

    recIDs_dict = {}
    recIDs_out = []

    if not tags:
        # tags have not been computed yet
        sort_fields = sort_field.split(',')
        tags, error_field = get_tags_from_sort_fields(sort_fields)
        if error_field:
            return slice_records(recIDs, jrec, rg)

    # check if we have sorting tag defined:
    if tags:
        # fetch the necessary field values:
        for recID in recIDs:
            val = ""  # will hold value for recID according to which sort
            vals = []  # will hold all values found in sorting tag for recID
            for tag in tags:
                if cfg['CFG_CERN_SITE'] and tag == '773__c':
                    # CERN hack: journal sorting
                    # 773__c contains page numbers, e.g. 3-13,
                    # and we want to sort by 3, and numerically:
                    vals.extend([
                        "%050s" % x.split("-", 1)[0]
                        for x in get_fieldvalues(recID, tag)
                    ])
                else:
                    vals.extend(get_fieldvalues(recID, tag))
            if sort_pattern:
                # try to pick that tag value that corresponds to sort pattern
                bingo = 0
                for v in vals:
                    if v.lower().startswith(sort_pattern.lower()):  # bingo!
                        bingo = 1
                        val = v
                        break
                if not bingo:
                    # sort_pattern not present, so add other vals after spaces
                    val = sort_pattern + "          " + ''.join(vals)
            else:
                # no sort pattern defined, so join them all together
                val = ''.join(vals)
            # sort values regardless of accents and case
            val = strip_accents(val.lower())
            if val in recIDs_dict:
                recIDs_dict[val].append(recID)
            else:
                recIDs_dict[val] = [recID]

        # create output array:
        for k in sorted(recIDs_dict.keys()):
            recIDs_out.extend(recIDs_dict[k])

        # ascending or descending?
        if sort_order == 'd':
            recIDs_out.reverse()

        recIDs = recIDs_out

    # return only up to the maximum that we need
    return slice_records(recIDs, jrec, rg)
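
A condensed sketch of the sorting strategy used here: group record IDs under an accent- and case-folded key, emit the groups in key order, and reverse the list for a descending sort. The value_of callback is a hypothetical stand-in for the get_fieldvalues lookup, and accent stripping is left out:

def sort_records(rec_ids, value_of, sort_order='d'):
    buckets = {}
    for rec_id in rec_ids:
        key = value_of(rec_id).lower()  # the real code also strips accents
        buckets.setdefault(key, []).append(rec_id)
    out = []
    for key in sorted(buckets):
        out.extend(buckets[key])
    if sort_order == 'd':
        out.reverse()
    return out

titles = {1: 'Zeta functions', 2: 'Électrodynamique', 3: 'axions'}
print(sort_records([1, 2, 3], titles.get, sort_order='a'))
# -> [3, 1, 2]  ('é' sorts after 'z' here since accents are not folded)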
Example #9
    def _sort_case_insensitive_strip_accents(self, val):
        """Remove accents and convert to lower case"""
        if not val:
            return ''
        return strip_accents(str(val).lower())
Example #10
def create_nearest_terms_box(urlargd, p, f, t='w', n=5, ln=None,
                             intro_text_p=True):
    """Return text box containing list of 'n' nearest terms above/below 'p'.

    Look in the field 'f' for matching type 't' (words/phrases) in language
    'ln'.  Propose new searches according to `urlargd' with the new words.  If
    `intro_text_p' is true, then display the introductory message, otherwise
    print only the nearest terms in the box content.
    """
    # load the right message language
    _ = gettext_set_language(ln or g.ln)

    if not CFG_WEBSEARCH_DISPLAY_NEAREST_TERMS:
        return _("Your search did not match any records.  Please try again.")
    nearest_terms = []
    if not p: # sanity check
        p = "."
    if p.startswith('%') and p.endswith('%'):
        p = p[1:-1] # fix for partial phrase
    index_id = get_index_id_from_field(f)
    if f == 'fulltext':
        if CFG_SOLR_URL:
            return _("No match found, please enter different search terms.")
        else:
            # FIXME: workaround for not having native phrase index yet
            t = 'w'
    # special indexes:
    if f == 'refersto' or f == 'referstoexcludingselfcites':
        return _("There are no records referring to %(x_rec)s.", x_rec=cgi.escape(p))
    if f == 'cataloguer':
        return _("There are no records modified by %(x_rec)s.", x_rec=cgi.escape(p))
    if f == 'citedby' or f == 'citedbyexcludingselfcites':
        return _("There are no records cited by %(x_rec)s.", x_rec=cgi.escape(p))
    # look for nearest terms:
    if t == 'w':
        nearest_terms = get_nearest_terms_in_bibwords(p, f, n, n)
        if not nearest_terms:
            return _("No word index is available for %(x_name)s.",
                   x_name=('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>'))
    else:
        nearest_terms = []
        if index_id:
            nearest_terms = get_nearest_terms_in_idxphrase(p, index_id, n, n)
        if f == 'datecreated' or f == 'datemodified':
            nearest_terms = get_nearest_terms_in_bibrec(p, f, n, n)
        if not nearest_terms:
            nearest_terms = get_nearest_terms_in_bibxxx(p, f, n, n)
        if not nearest_terms:
            return _("No phrase index is available for %(x_name)s.",
                   x_name=('<em>' + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + '</em>'))

    terminfo = []
    for term in nearest_terms:
        if t == 'w':
            hits = get_nbhits_in_bibwords(term, f)
        else:
            if index_id:
                hits = get_nbhits_in_idxphrases(term, f)
            elif f == 'datecreated' or f == 'datemodified':
                hits = get_nbhits_in_bibrec(term, f)
            else:
                hits = get_nbhits_in_bibxxx(term, f)

        argd = {}
        argd.update(urlargd)

        # check which fields contained the requested parameter, and replace it.
        for px, dummy_fx in ('p', 'f'), ('p1', 'f1'), ('p2', 'f2'), ('p3', 'f3'):
            if px in argd:
                argd_px = argd[px]
                if t == 'w':
                    # p was stripped of accents, so do the same:
                    argd_px = strip_accents(argd_px)
                #argd[px] = string.replace(argd_px, p, term, 1)
                #we need something similar, but case insensitive
                pattern_index = string.find(argd_px.lower(), p.lower())
                if pattern_index > -1:
                    argd[px] = argd_px[:pattern_index] + term + argd_px[pattern_index+len(p):]
                    break
                #this is doing exactly the same as:
                #argd[px] = re.sub('(?i)' + re.escape(p), term, argd_px, 1)
                #but is ~4x faster (2us vs. 8.25us)
        terminfo.append((term, hits, argd))

    intro = ""
    if intro_text_p: # add full leading introductory text
        if f:
            intro = _("Search term %(x_term)s inside index %(x_index)s did not match any record. Nearest terms in any collection are:") % \
                     {'x_term': "<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>",
                      'x_index': "<em>" + cgi.escape(get_field_i18nname(get_field_name(f) or f, ln, False)) + "</em>"}
        else:
            intro = _("Search term %(x_name)s did not match any record. Nearest terms in any collection are:",
                     x_name=("<em>" + cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p) + "</em>"))

    return websearch_templates.tmpl_nearest_term_box(p=p, ln=ln, f=f, terminfo=terminfo,
                                                     intro=intro)
Example #11
def get_nearest_terms_in_bibxxx(p, f, n_below, n_above):
    """Browse (-n_above, +n_below) closest bibliographic phrases
       for the given pattern p in the given field f, regardless
       of collection.
       Return list of [phrase1, phrase2, ... , phrase_n]."""
    # determine browse field:
    if not f and string.find(p, ":") > 0:  # does 'p' contain ':'?
        f, p = string.split(p, ":", 1)

    # FIXME: quick hack for the journal index
    if f == 'journal':
        return get_nearest_terms_in_bibwords(p, f, n_below, n_above)

    # We are going to take max(n_below, n_above) as the number of
    # values to fetch from bibXXx.  This is needed to work around
    # MySQL UTF-8 sorting troubles in 4.0.x.  Proper solution is to
    # use MySQL 4.1.x or our own idxPHRASE in the future.

    index_id = get_index_id_from_field(f)
    if index_id:
        return get_nearest_terms_in_idxphrase(p, index_id, n_below, n_above)

    n_fetch = 2 * max(n_below, n_above)
    # construct 'tl' which defines the tag list (MARC tags) to search in:
    tl = []
    if str(f[0]).isdigit() and str(f[1]).isdigit():
        tl.append(f)  # 'f' seems to be okay as it starts by two digits
    else:
        # deduce desired MARC tags on the basis of chosen 'f'
        tl = get_field_tags(f)
    # start browsing to fetch list of hits:
    browsed_phrases = {}
    # will hold {phrase1: 1, phrase2: 1, ..., phraseN: 1} dict of browsed
    # phrases (to make them unique)

    # always add self to the results set:
    browsed_phrases[p.startswith("%") and p.endswith("%") and p[1:-1] or p] = 1
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        model = getattr(models, 'Bib{0}{1}x'.format(digit1, digit2))

        if len(t) != 6 or t[-1:] == '%':
            # only the beginning of field 't' is defined, so add wildcard
            # character:
            condition = model.tag.like(t + '%')
        else:
            condition = model.tag == t

        res = set([item[0] for item in model.query.filter(
            model.value < p, condition
        ).order_by(model.value.desc()).limit(n_fetch).values(model.value)])
        res |= set([item[0] for item in model.query.filter(
            model.value > p, condition
        ).order_by(model.value.asc()).limit(n_fetch).values(model.value)])

    # select first n words only: (this is needed as we were searching
    # in many different tables and so aren't sure we have more than n
    # words right; this of course won't be needed when we shall have
    # one ACC table only for given field):
    phrases_out = list(res)
    phrases_out.sort(lambda x, y: cmp(string.lower(strip_accents(x)),
                                      string.lower(strip_accents(y))))
    # find position of self:
    try:
        idx_p = phrases_out.index(p)
    except ValueError:
        idx_p = len(phrases_out)/2
    # return n_above and n_below:
    return phrases_out[max(0, idx_p-n_above):idx_p+n_below]
Example #12
File: engine.py  Project: SCOAP3/invenio
def sort_records_bibxxx(recIDs, tags, sort_field='', sort_order='d',
                        sort_pattern='', rg=None, jrec=None):
    """Sort record list according sort field in given order.

    If more than one instance of 'sort_field' is found for a given record, try
    to choose the one that is given by 'sort_pattern', for example "sort by report
    number that starts by CERN-PS".  Note that 'sort_field' can be field code
    like 'author' or MARC tag like '100__a' directly.
    """
    # check arguments:
    if not sort_field:
        return slice_records(recIDs, jrec, rg)

    if len(recIDs) > cfg['CFG_WEBSEARCH_NB_RECORDS_TO_SORT']:
        return slice_records(recIDs, jrec, rg)

    recIDs_dict = {}
    recIDs_out = []

    if not tags:
        # tags have not been computed yet
        sort_fields = sort_field.split(',')
        tags, error_field = get_tags_from_sort_fields(sort_fields)
        if error_field:
            return slice_records(recIDs, jrec, rg)

    # check if we have sorting tag defined:
    if tags:
        # fetch the necessary field values:
        for recID in recIDs:
            val = ""  # will hold value for recID according to which sort
            vals = []  # will hold all values found in sorting tag for recID
            for tag in tags:
                if cfg['CFG_CERN_SITE'] and tag == '773__c':
                    # CERN hack: journal sorting
                    # 773__c contains page numbers, e.g. 3-13,
                    # and we want to sort by 3, and numerically:
                    vals.extend([
                        "%050s" % x.split("-", 1)[0]
                        for x in get_fieldvalues(recID, tag)])
                else:
                    vals.extend(get_fieldvalues(recID, tag))
            if sort_pattern:
                # try to pick that tag value that corresponds to sort pattern
                bingo = 0
                for v in vals:
                    if v.lower().startswith(sort_pattern.lower()):  # bingo!
                        bingo = 1
                        val = v
                        break
                if not bingo:
                    # sort_pattern not present, so add other vals after spaces
                    val = sort_pattern + "          " + ''.join(vals)
            else:
                # no sort pattern defined, so join them all together
                val = ''.join(vals)
            # sort values regardless of accents and case
            val = strip_accents(val.lower())
            if val in recIDs_dict:
                recIDs_dict[val].append(recID)
            else:
                recIDs_dict[val] = [recID]

        # create output array:
        for k in sorted(recIDs_dict.keys()):
            recIDs_out.extend(recIDs_dict[k])

        # ascending or descending?
        if sort_order == 'd':
            recIDs_out.reverse()

        recIDs = recIDs_out

    # return only up to the maximum that we need
    return slice_records(recIDs, jrec, rg)