Пример #1
0
def sort_records(recIDs,
                 sort_field='',
                 sort_order='a',
                 sort_pattern='',
                 rg=None,
                 jrec=None,
                 sorting_methods=None):
    """Initial entry point for sorting records, acts like a dispatcher.

    1. sort_field is in the bsrMETHOD, and thus, the BibSort has sorted the
       data for this field, so we can use the cache;

    2. sort_field is not in bsrMETHOD, and thus, the cache does not contain
       any information regarding this sorting method.
    """
    sorting_methods = sorting_methods or SORTING_METHODS

    # bibsort does not handle sort_pattern for now, use bibxxx
    if sort_pattern:
        return sort_records_bibxxx(recIDs,
                                   None,
                                   sort_field,
                                   sort_order,
                                   sort_pattern,
                                   rg=rg,
                                   jrec=jrec)

    # ignore the use of buckets, use old fashion sorting
    use_sorting_buckets = cfg['CFG_BIBSORT_ENABLED']

    # Default sorting
    if not sort_field:
        if use_sorting_buckets:
            return sort_records_bibsort(recIDs,
                                        cfg['CFG_BIBSORT_DEFAULT_FIELD'],
                                        sort_field,
                                        cfg['CFG_BIBSORT_DEFAULT_FIELD_ORDER'],
                                        rg, jrec)
        else:
            if sort_order == 'd':
                recIDs.reverse()
            return slice_records(recIDs, jrec, rg)

    sort_fields = sort_field.split(",")
    if len(sort_fields) == 1:
        # we have only one sorting_field, check if it is treated by BibSort
        for sort_method in sorting_methods:
            definition = sorting_methods[sort_method]
            if use_sorting_buckets and (
                (definition.startswith('FIELD') and definition.replace(
                    'FIELD:', '').strip().lower() == sort_fields[0].lower())
                    or sort_method == sort_fields[0]):
                # use BibSort
                return sort_records_bibsort(recIDs, sort_method, sort_field,
                                            sort_order, rg, jrec)
    # deduce sorting MARC tag out of the 'sort_field' argument:
    tags, error_field = get_tags_from_sort_fields(sort_fields)
    if error_field:
        if use_sorting_buckets:
            return sort_records_bibsort(recIDs,
                                        cfg['CFG_BIBSORT_DEFAULT_FIELD'],
                                        sort_field, sort_order, rg, jrec)
        else:
            return slice_records(recIDs, jrec, rg)
    elif tags:
        for sort_method in sorting_methods:
            definition = sorting_methods[sort_method]
            if definition.startswith('MARC') and \
                    definition.replace(
                        'MARC:', '').strip().split(',') == tags \
                    and use_sorting_buckets:
                # this list of tags have a designated method in BibSort
                return sort_records_bibsort(recIDs, sort_method, sort_field,
                                            sort_order, rg, jrec)
        # we do not have this sort_field in BibSort tables
        # -> do the old fashion sorting
        return sort_records_bibxxx(recIDs, tags, sort_field, sort_order,
                                   sort_pattern, rg, jrec)
    else:
        return slice_records(recIDs, jrec, rg)
Пример #2
0
def sort_records_bibxxx(recIDs,
                        tags,
                        sort_field='',
                        sort_order='d',
                        sort_pattern='',
                        rg=None,
                        jrec=None):
    """Sort record list according sort field in given order.

    If more than one instance of 'sort_field' is found for a given record, try
    to choose that that is given by 'sort pattern', for example "sort by report
    number that starts by CERN-PS".  Note that 'sort_field' can be field code
    like 'author' or MARC tag like '100__a' directly.
    """
    # check arguments:
    if not sort_field:
        return slice_records(recIDs, jrec, rg)

    if len(recIDs) > cfg['CFG_WEBSEARCH_NB_RECORDS_TO_SORT']:
        return slice_records(recIDs, jrec, rg)

    recIDs_dict = {}
    recIDs_out = []

    if not tags:
        # tags have not been camputed yet
        sort_fields = sort_field.split(',')
        tags, error_field = get_tags_from_sort_fields(sort_fields)
        if error_field:
            return slice_records(recIDs, jrec, rg)

    # check if we have sorting tag defined:
    if tags:
        # fetch the necessary field values:
        for recID in recIDs:
            val = ""  # will hold value for recID according to which sort
            vals = []  # will hold all values found in sorting tag for recID
            for tag in tags:
                if cfg['CFG_CERN_SITE'] and tag == '773__c':
                    # CERN hack: journal sorting
                    # 773__c contains page numbers, e.g. 3-13,
                    # and we want to sort by 3, and numerically:
                    vals.extend([
                        "%050s" % x.split("-", 1)[0]
                        for x in get_fieldvalues(recID, tag)
                    ])
                else:
                    vals.extend(get_fieldvalues(recID, tag))
            if sort_pattern:
                # try to pick that tag value that corresponds to sort pattern
                bingo = 0
                for v in vals:
                    if v.lower().startswith(sort_pattern.lower()):  # bingo!
                        bingo = 1
                        val = v
                        break
                if not bingo:
                    # sort_pattern not present, so add other vals after spaces
                    val = sort_pattern + "          " + ''.join(vals)
            else:
                # no sort pattern defined, so join them all together
                val = ''.join(vals)
            # sort values regardless of accents and case
            val = strip_accents(val.lower())
            if val in recIDs_dict:
                recIDs_dict[val].append(recID)
            else:
                recIDs_dict[val] = [recID]

        # create output array:
        for k in sorted(recIDs_dict.keys()):
            recIDs_out.extend(recIDs_dict[k])

        # ascending or descending?
        if sort_order == 'd':
            recIDs_out.reverse()

        recIDs = recIDs_out

    # return only up to the maximum that we need
    return slice_records(recIDs, jrec, rg)
Пример #3
0
def sort_records_bibsort(recIDs,
                         sort_method,
                         sort_field='',
                         sort_order='d',
                         rg=None,
                         jrec=1,
                         sort_or_rank='s',
                         sorting_methods=SORTING_METHODS):
    """Order the list based on a sorting method using the BibSortDataCacher."""
    sorting_methods = sorting_methods or SORTING_METHODS

    if not jrec:
        jrec = 1

    # sanity check
    if sort_method not in sorting_methods:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0,
                                hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs, None, sort_field, sort_order,
                                       '', rg, jrec)

    # we should return sorted records up to irec_max(exclusive)
    dummy, irec_max = get_interval_for_records_to_sort(len(recIDs), jrec, rg)
    solution = intbitset()
    input_recids = intbitset(recIDs)
    CACHE_SORTED_DATA[sort_method].recreate_cache_if_needed()
    sort_cache = CACHE_SORTED_DATA[sort_method].cache
    bucket_numbers = sort_cache['bucket_data'].keys()
    # check if all buckets have been constructed
    if len(bucket_numbers) != cfg['CFG_BIBSORT_BUCKETS']:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0,
                                hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs,
                                       None,
                                       sort_field,
                                       sort_order,
                                       rg=rg,
                                       jrec=jrec)

    if sort_order == 'd':
        bucket_numbers.reverse()
    for bucket_no in bucket_numbers:
        solution.union_update(input_recids
                              & sort_cache['bucket_data'][bucket_no])
        if len(solution) >= irec_max:
            break

    dict_solution = {}
    missing_records = intbitset()
    for recid in solution:
        try:
            dict_solution[recid] = sort_cache['data_dict_ordered'][recid]
        except KeyError:
            # recid is in buckets, but not in the bsrMETHODDATA,
            # maybe because the value has been deleted, but the change has not
            # yet been propagated to the buckets
            missing_records.add(recid)

    # check if there are recids that are not in any bucket -> to be added at
    # the end/top, ordered by insertion date
    if len(solution) < irec_max:
        # some records have not been yet inserted in the bibsort structures
        # or, some records have no value for the sort_method
        missing_records += input_recids - solution

    reverse = sort_order == 'd'

    if sort_method.strip().lower() == cfg['CFG_BIBSORT_DEFAULT_FIELD'] and \
            reverse:
        # If we want to sort the records on their insertion date, add the
        # missing records at the top.
        solution = sorted(missing_records, reverse=True) + \
            sorted(dict_solution, key=dict_solution.__getitem__, reverse=True)
    else:
        solution = sorted(dict_solution,
                          key=dict_solution.__getitem__,
                          reverse=reverse) + sorted(missing_records)

    # Only keep records, we are going to display
    solution = slice_records(solution, jrec, rg)

    if sort_or_rank == 'r':
        # We need the recids, with their ranking score
        return solution, [dict_solution.get(record, 0) for record in solution]
    else:
        return solution
Пример #4
0
def sort_records(recIDs, sort_field='', sort_order='a', sort_pattern='',
                 rg=None, jrec=None, sorting_methods=None):
    """Initial entry point for sorting records, acts like a dispatcher.

    1. sort_field is in the bsrMETHOD, and thus, the BibSort has sorted the
       data for this field, so we can use the cache;

    2. sort_field is not in bsrMETHOD, and thus, the cache does not contain
       any information regarding this sorting method.
    """
    sorting_methods = sorting_methods or SORTING_METHODS

    # bibsort does not handle sort_pattern for now, use bibxxx
    if sort_pattern:
        return sort_records_bibxxx(recIDs, None, sort_field, sort_order,
                                   sort_pattern, rg=rg, jrec=jrec)

    # ignore the use of buckets, use old fashion sorting
    use_sorting_buckets = cfg['CFG_BIBSORT_ENABLED']

    # Default sorting
    if not sort_field:
        if use_sorting_buckets:
            return sort_records_bibsort(
                recIDs, cfg['CFG_BIBSORT_DEFAULT_FIELD'], sort_field,
                cfg['CFG_BIBSORT_DEFAULT_FIELD_ORDER'], rg, jrec)
        else:
            if sort_order == 'd':
                recIDs.reverse()
            return slice_records(recIDs, jrec, rg)

    sort_fields = sort_field.split(",")
    if len(sort_fields) == 1:
        # we have only one sorting_field, check if it is treated by BibSort
        for sort_method in sorting_methods:
            definition = sorting_methods[sort_method]
            if use_sorting_buckets and ((
                    definition.startswith('FIELD') and definition.replace(
                        'FIELD:', '').strip().lower() == sort_fields[0].lower()
                    ) or sort_method == sort_fields[0]):
                # use BibSort
                return sort_records_bibsort(recIDs, sort_method, sort_field,
                                            sort_order, rg, jrec)
    # deduce sorting MARC tag out of the 'sort_field' argument:
    tags, error_field = get_tags_from_sort_fields(sort_fields)
    if error_field:
        if use_sorting_buckets:
            return sort_records_bibsort(
                recIDs, cfg['CFG_BIBSORT_DEFAULT_FIELD'], sort_field,
                sort_order, rg, jrec)
        else:
            return slice_records(recIDs, jrec, rg)
    elif tags:
        for sort_method in sorting_methods:
            definition = sorting_methods[sort_method]
            if definition.startswith('MARC') and \
                    definition.replace(
                        'MARC:', '').strip().split(',') == tags \
                    and use_sorting_buckets:
                # this list of tags have a designated method in BibSort
                return sort_records_bibsort(recIDs, sort_method, sort_field,
                                            sort_order, rg, jrec)
        # we do not have this sort_field in BibSort tables
        # -> do the old fashion sorting
        return sort_records_bibxxx(recIDs, tags, sort_field, sort_order,
                                   sort_pattern, rg, jrec)
    else:
        return slice_records(recIDs, jrec, rg)
Пример #5
0
def sort_records_bibxxx(recIDs, tags, sort_field='', sort_order='d',
                        sort_pattern='', rg=None, jrec=None):
    """Sort record list according sort field in given order.

    If more than one instance of 'sort_field' is found for a given record, try
    to choose that that is given by 'sort pattern', for example "sort by report
    number that starts by CERN-PS".  Note that 'sort_field' can be field code
    like 'author' or MARC tag like '100__a' directly.
    """
    # check arguments:
    if not sort_field:
        return slice_records(recIDs, jrec, rg)

    if len(recIDs) > cfg['CFG_WEBSEARCH_NB_RECORDS_TO_SORT']:
        return slice_records(recIDs, jrec, rg)

    recIDs_dict = {}
    recIDs_out = []

    if not tags:
        # tags have not been camputed yet
        sort_fields = sort_field.split(',')
        tags, error_field = get_tags_from_sort_fields(sort_fields)
        if error_field:
            return slice_records(recIDs, jrec, rg)

    # check if we have sorting tag defined:
    if tags:
        # fetch the necessary field values:
        for recID in recIDs:
            val = ""  # will hold value for recID according to which sort
            vals = []  # will hold all values found in sorting tag for recID
            for tag in tags:
                if cfg['CFG_CERN_SITE'] and tag == '773__c':
                    # CERN hack: journal sorting
                    # 773__c contains page numbers, e.g. 3-13,
                    # and we want to sort by 3, and numerically:
                    vals.extend([
                        "%050s" % x.split("-", 1)[0]
                        for x in get_fieldvalues(recID, tag)])
                else:
                    vals.extend(get_fieldvalues(recID, tag))
            if sort_pattern:
                # try to pick that tag value that corresponds to sort pattern
                bingo = 0
                for v in vals:
                    if v.lower().startswith(sort_pattern.lower()):  # bingo!
                        bingo = 1
                        val = v
                        break
                if not bingo:
                    # sort_pattern not present, so add other vals after spaces
                    val = sort_pattern + "          " + ''.join(vals)
            else:
                # no sort pattern defined, so join them all together
                val = ''.join(vals)
            # sort values regardless of accents and case
            val = strip_accents(val.lower())
            if val in recIDs_dict:
                recIDs_dict[val].append(recID)
            else:
                recIDs_dict[val] = [recID]

        # create output array:
        for k in sorted(recIDs_dict.keys()):
            recIDs_out.extend(recIDs_dict[k])

        # ascending or descending?
        if sort_order == 'd':
            recIDs_out.reverse()

        recIDs = recIDs_out

    # return only up to the maximum that we need
    return slice_records(recIDs, jrec, rg)
Пример #6
0
def sort_records_bibsort(recIDs, sort_method, sort_field='', sort_order='d',
                         rg=None, jrec=1, sort_or_rank='s',
                         sorting_methods=SORTING_METHODS):
    """Order the list based on a sorting method using the BibSortDataCacher."""
    sorting_methods = sorting_methods or SORTING_METHODS

    if not jrec:
        jrec = 1

    # sanity check
    if sort_method not in sorting_methods:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0,
                                hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs, None, sort_field, sort_order,
                                       '', rg, jrec)

    # we should return sorted records up to irec_max(exclusive)
    dummy, irec_max = get_interval_for_records_to_sort(len(recIDs), jrec, rg)
    solution = intbitset()
    input_recids = intbitset(recIDs)
    CACHE_SORTED_DATA[sort_method].recreate_cache_if_needed()
    sort_cache = CACHE_SORTED_DATA[sort_method].cache
    bucket_numbers = sort_cache['bucket_data'].keys()
    # check if all buckets have been constructed
    if len(bucket_numbers) != cfg['CFG_BIBSORT_BUCKETS']:
        if sort_or_rank == 'r':
            return rank_records(rank_method_code=sort_method,
                                rank_limit_relevance=0, hitset=recIDs)
        else:
            return sort_records_bibxxx(recIDs, None, sort_field,
                                       sort_order, rg=rg, jrec=jrec)

    if sort_order == 'd':
        bucket_numbers.reverse()
    for bucket_no in bucket_numbers:
        solution.union_update(
            input_recids & sort_cache['bucket_data'][bucket_no]
        )
        if len(solution) >= irec_max:
            break

    dict_solution = {}
    missing_records = intbitset()
    for recid in solution:
        try:
            dict_solution[recid] = sort_cache['data_dict_ordered'][recid]
        except KeyError:
            # recid is in buckets, but not in the bsrMETHODDATA,
            # maybe because the value has been deleted, but the change has not
            # yet been propagated to the buckets
            missing_records.add(recid)

    # check if there are recids that are not in any bucket -> to be added at
    # the end/top, ordered by insertion date
    if len(solution) < irec_max:
        # some records have not been yet inserted in the bibsort structures
        # or, some records have no value for the sort_method
        missing_records += input_recids - solution

    reverse = sort_order == 'd'

    if sort_method.strip().lower() == cfg['CFG_BIBSORT_DEFAULT_FIELD'] and \
            reverse:
        # If we want to sort the records on their insertion date, add the
        # missing records at the top.
        solution = sorted(missing_records, reverse=True) + \
            sorted(dict_solution, key=dict_solution.__getitem__, reverse=True)
    else:
        solution = sorted(dict_solution, key=dict_solution.__getitem__,
                          reverse=reverse) + sorted(missing_records)

    # Only keep records, we are going to display
    solution = slice_records(solution, jrec, rg)

    if sort_or_rank == 'r':
        # We need the recids, with their ranking score
        return solution, [dict_solution.get(record, 0) for record in solution]
    else:
        return solution