def perform_external_collection_search(
        req,
        current_collection,
        pattern_list,
        field,
        external_collection,
        verbosity_level=0,
        lang=CFG_SITE_LANG,
        selected_external_collections_infos=None):
    """Run the external collection search, then emit the "see also" box.

    When ``selected_external_collections_infos`` is truthy it is unpacked as
    ``(search_engines, seealso_engines, pattern, basic_search_units)``;
    otherwise that 4-tuple is obtained by calling
    ``print_external_results_overview`` with the remaining arguments.

    If the resulting pattern is empty nothing else is done.  The function
    always returns ``None``; its work happens through the helpers it calls.
    """
    vprint = get_verbose_print(req, 'External collection: ', verbosity_level)

    infos = selected_external_collections_infos
    if not infos:
        # The caller supplied no precomputed selection: compute it now.
        infos = print_external_results_overview(
            req, current_collection, pattern_list, field,
            external_collection, verbosity_level, lang)
    search_engines, seealso_engines, pattern, basic_search_units = infos

    if pattern:
        do_external_search(req, lang, vprint, basic_search_units,
                           search_engines)
        create_seealso_box(req, lang, vprint, basic_search_units,
                           seealso_engines, pattern)
        vprint(3, 'end')
# Example #2
# 0
def calculate_hosted_collections_search_params(req,
                                               pattern_list,
                                               field,
                                               hosted_collections,
                                               verbosity_level=0):
    """Work out the search parameters for the selected hosted collections.

    Returns a ``(hosted_search_engines, basic_search_units)`` tuple: the
    value of ``select_hosted_search_engines(hosted_collections)`` and the
    value of ``create_basic_search_units`` applied to the bound pattern and
    ``field``.  Each intermediate value is reported through the verbose
    printer at level 3.
    """
    from invenio.search_engine import create_basic_search_units
    assert req
    log = get_verbose_print(
        req,
        'Hosted collections (calculate_hosted_collections_search_params): ',
        verbosity_level)

    bound_pattern = bind_patterns(pattern_list)
    log(3, 'pattern = %s' % cgi.escape(bound_pattern))

    # An empty pattern is deliberately NOT treated as an error here: the
    # search is allowed to go on with an empty pattern_list and field.
    units = create_basic_search_units(None, bound_pattern, field)
    log(3, 'basic_search_units = %s' % cgi.escape(repr(units)))

    # The engines are reported as selected; no sorted listing is produced.
    engines = select_hosted_search_engines(hosted_collections)
    log(3, 'hosted_search_engines = ' + str(engines))

    return (engines, units)
def print_external_results_overview(req, current_collection, pattern_list, field,
        external_collection, verbosity_level=0, lang=CFG_SITE_LANG):
    """Print the external collection overview box.

    Binds ``pattern_list`` into a single pattern, parses it into basic
    search units, selects the external engines for ``current_collection`` /
    ``external_collection`` and writes the overview HTML to ``req``.

    Returns ``(search_engines, seealso_engines, pattern,
    basic_search_units)``, or ``(None, None, None, None)`` when the bound
    pattern is empty (in which case nothing is written).
    """
    # Local import so this block does not depend on a module-level import.
    import cgi
    from invenio.search_engine import create_basic_search_units
    assert req
    vprint = get_verbose_print(req, 'External collection (print_external_results_overview): ', verbosity_level)

    pattern = bind_patterns(pattern_list)
    # The pattern comes from the query, so it is escaped before being handed
    # to the verbose printer (same treatment as the other definition of this
    # function in the file).
    vprint(3, 'pattern = ' + cgi.escape(pattern))

    if not pattern:
        return (None, None, None, None)

    basic_search_units = create_basic_search_units(None, pattern, field)
    # The search units are derived from the pattern, so escape them as well.
    vprint(3, 'basic_search_units = ' + cgi.escape(str(basic_search_units)))

    (search_engines, seealso_engines) = select_external_engines(current_collection, external_collection)
    vprint(3, 'search_engines = ' + str(search_engines))
    vprint(3, 'seealso_engines = ' + str(seealso_engines))

    search_engines_list = external_collection_sort_engine_by_name(search_engines)
    vprint(3, 'search_engines_list (sorted) : ' + str(search_engines_list))
    html = template.external_collection_overview(lang, search_engines_list)
    req.write(html)

    return (search_engines, seealso_engines, pattern, basic_search_units)
def calculate_hosted_collections_search_params(req,
                                               pattern_list,
                                               field,
                                               hosted_collections,
                                               verbosity_level=0):
    """Calculate the search parameters for the selected hosted collections.

    Returns a ``(hosted_search_engines, basic_search_units)`` tuple, i.e.
    the result of ``select_hosted_search_engines(hosted_collections)`` and
    the basic search units built from the bound pattern and ``field``.
    """
    # Local import so this block does not depend on a module-level import.
    import cgi
    from invenio.search_engine import create_basic_search_units
    assert req
    vprint = get_verbose_print(req, 'Hosted collections (calculate_hosted_collections_search_params): ', verbosity_level)

    pattern = bind_patterns(pattern_list)
    # The pattern comes from the query: escape it before it reaches the
    # verbose output.
    vprint(3, 'pattern = ' + cgi.escape(pattern))

    # An empty pattern is deliberately allowed: the search goes on even with
    # an empty pattern_list and field, so there is no early return here.

    # calculate the basic search units (derived from the pattern, so escaped)
    basic_search_units = create_basic_search_units(None, pattern, field)
    vprint(3, 'basic_search_units = ' + cgi.escape(str(basic_search_units)))

    # calculate the set of hosted search engines
    hosted_search_engines = select_hosted_search_engines(hosted_collections)
    vprint(3, 'hosted_search_engines = ' + str(hosted_search_engines))

    return (hosted_search_engines, basic_search_units)
def calculate_hosted_collections_results(req, pattern_list, field, hosted_collections, verbosity_level=0,
                                         lang=CFG_SITE_LANG, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
    """Return the results for every hosted collection, organized in tuples.

    The search parameters are computed with
    ``calculate_hosted_collections_search_params`` and then handed to
    ``do_calculate_hosted_collections_results``, whose return value is
    returned unchanged.

    Returns ``(None, None)`` when ``hosted_collections`` is empty or when no
    hosted search engine was selected.
    """
    # Local import so this block does not depend on a module-level import.
    import cgi

    # Normally callers check this before calling; kept as a safety net.
    if not hosted_collections:
        return (None, None)

    vprint = get_verbose_print(req, 'Hosted collections: ', verbosity_level)
    # pattern_list and field originate from the query: escape them before
    # they reach the verbose output.
    vprint(3, 'pattern_list = ' + cgi.escape(str(pattern_list)) + ', field = ' + cgi.escape(str(field)))

    # firstly we calculate the search parameters, i.e. the actual hosted
    # search engines and the basic search units
    (hosted_search_engines, basic_search_units) = \
        calculate_hosted_collections_search_params(req,
                                                   pattern_list,
                                                   field,
                                                   hosted_collections,
                                                   verbosity_level)

    # Empty basic search units are allowed (empty pattern_list and field);
    # only the absence of engines stops the search.  The truthiness test
    # also copes with None instead of raising on len().
    if not hosted_search_engines:
        return (None, None)

    # finally compute the list of tuples with the results
    results = do_calculate_hosted_collections_results(req, lang, vprint, verbosity_level, basic_search_units, hosted_search_engines, timeout)

    # This trace used to sit after the return statement and never ran.
    vprint(3, 'end')
    return results
def print_external_results_overview(req, current_collection, pattern_list, field,
        external_collection, verbosity_level=0, lang=CFG_SITE_LANG, print_overview=True):
    """Select the external engines for a query and optionally print the overview box.

    Returns ``(search_engines, seealso_engines, pattern,
    basic_search_units)``.  When the bound pattern is empty the function
    returns ``(None, None, None, None)`` right after tracing it.  The
    overview HTML is written to ``req`` only when ``print_overview`` is true.
    """
    from invenio.search_engine import create_basic_search_units
    assert req
    log = get_verbose_print(req, 'External collection (print_external_results_overview): ', verbosity_level)

    query = bind_patterns(pattern_list)
    log(3, 'pattern = %s' % cgi.escape(query))
    if not query:
        return (None, None, None, None)

    units = create_basic_search_units(None, query, field)
    log(3, 'basic_search_units = %s' % cgi.escape(repr(units)))

    search_engines, seealso_engines = select_external_engines(current_collection, external_collection)
    log(3, 'search_engines = ' + str(search_engines))
    log(3, 'seealso_engines = ' + str(seealso_engines))

    # The sorted list is traced in every case, but rendered only on request.
    ordered_engines = external_collection_sort_engine_by_name(search_engines)
    log(3, 'search_engines_list (sorted) : ' + str(ordered_engines))
    if print_overview:
        req.write(template.external_collection_overview(lang, ordered_engines))

    return (search_engines, seealso_engines, query, units)
def calculate_hosted_collections_results(req, pattern_list, field, hosted_collections, verbosity_level=0,
                                         lang=CFG_SITE_LANG, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
    """Return the results for every hosted collection, organized in tuples.

    The search parameters are computed with
    ``calculate_hosted_collections_search_params`` and then handed to
    ``do_calculate_hosted_collections_results``, whose return value is
    returned unchanged.

    Returns ``(None, None)`` when ``hosted_collections`` is empty or when no
    hosted search engine was selected.
    """
    # Normally callers check this before calling; kept as a safety net.
    if not hosted_collections:
        return (None, None)

    vprint = get_verbose_print(req, 'Hosted collections: ', verbosity_level)
    # NOTE(review): cgi.escape(field) assumes field is a string -- confirm
    # that callers never pass None here.
    vprint(3, 'pattern_list = %s, field = %s' % (cgi.escape(repr(pattern_list)), cgi.escape(field)))

    # firstly we calculate the search parameters, i.e. the actual hosted
    # search engines and the basic search units
    (hosted_search_engines, basic_search_units) = \
        calculate_hosted_collections_search_params(req,
                                                   pattern_list,
                                                   field,
                                                   hosted_collections,
                                                   verbosity_level)

    # Empty basic search units are allowed (empty pattern_list and field);
    # only the absence of engines stops the search.  The truthiness test
    # also copes with None instead of raising on len().
    if not hosted_search_engines:
        return (None, None)

    # finally compute the list of tuples with the results
    results = do_calculate_hosted_collections_results(req, lang, vprint, verbosity_level, basic_search_units, hosted_search_engines, timeout)

    # This trace used to sit after the return statement and never ran.
    vprint(3, 'end')
    return results
def perform_external_collection_search(req, current_collection, pattern_list, field,
        external_collection, verbosity_level=0, lang=CFG_SITE_LANG, selected_external_collections_infos=None):
    """Search the external collections and print the "see also" box.

    A truthy ``selected_external_collections_infos`` is used as the
    ``(search_engines, seealso_engines, pattern, basic_search_units)``
    selection; otherwise the selection is computed by
    ``print_external_results_overview``.  Returns ``None``; when the pattern
    is empty it returns before searching.
    """
    vprint = get_verbose_print(req, 'External collection: ', verbosity_level)

    # "or" keeps the lazy behaviour: the overview helper is only called when
    # no precomputed selection was supplied.
    (search_engines, seealso_engines, pattern, basic_search_units) = (
        selected_external_collections_infos
        or print_external_results_overview(
            req, current_collection, pattern_list, field,
            external_collection, verbosity_level, lang))

    if not pattern:
        return None

    do_external_search(req, lang, vprint, basic_search_units, search_engines)
    create_seealso_box(req, lang, vprint, basic_search_units, seealso_engines, pattern)
    vprint(3, 'end')
# Example #9
# 0
def do_calculate_hosted_collections_results(
        req,
        lang,
        vprint,
        verbosity_level,
        basic_search_units,
        hosted_search_engines,
        timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
    """Search the hosted collections and collect their results.

    ``hosted_search_engines`` is either a ``set`` of engines (search URLs
    are built here) or a ``list`` of pre-computed
    ``[search url, engine, user url]`` entries.  Any other type yields empty
    results.

    Returns ``(full_results_list, timeout_list)``:

    * ``full_results_list`` holds one ``(entry, number of results, fetching
      time)`` tuple per entry whose download finished;
    * ``timeout_list`` holds the entries whose download did not finish.
    """
    # Return value is unused; the call is kept as in the original in case it
    # has side effects -- TODO confirm and drop if it has none.
    _ = gettext_set_language(lang)
    if not vprint:
        # When vprint is created here this function is probably being run on
        # its own, so the closing "end" trace (normally printed by the
        # calling calculate function) will not be printed.
        vprint = get_verbose_print(
            req,
            'Hosted collections (calculate_hosted_collections_search_params): ',
            verbosity_level)
    vprint(3, 'beginning hosted search')

    # [search url, engine, user url] entries to be downloaded
    engines_list = []
    # isinstance (rather than an exact type check) also accepts subclasses.
    if isinstance(hosted_search_engines, set):
        # engine-only collection: build the URLs now
        for engine in hosted_search_engines:
            url = engine.build_search_url(basic_search_units, req.args, lang)
            user_url = engine.build_user_search_url(basic_search_units,
                                                    req.args, lang)
            if url:
                engines_list.append([url, engine, user_url])
    elif isinstance(hosted_search_engines, list):
        # pre-calculated url+engine entries: use them as they are
        engines_list = list(hosted_search_engines)

    # one asynchronous page getter per search URL
    pagegetters_list = [
        HTTPAsyncPageGetter(entry[0]) for entry in engines_list
    ]

    # downloads that completed, as (pagegetter, data, current_time)
    results_list = []

    def finished(pagegetter, data, current_time):
        """Record a completed download; parsing is done further below."""
        results_list.append((pagegetter, data, current_time))

    # run the asynchronous getter
    finished_list = async_download(pagegetters_list, finished, engines_list,
                                   timeout)

    full_results_list = []
    timeout_list = []
    # The loop variable is deliberately not called "finished", so it does
    # not shadow the callback defined above.
    for (has_finished, entry) in zip(finished_list, engines_list):
        if not has_finished:
            timeout_list.append(entry)
            continue
        for (pagegetter, data, fetch_time) in results_list:
            if data == entry:
                # feed the engine's parser now; the actual parsing of the
                # records happens later, at printing time
                entry[1].parser.parse_and_get_results(pagegetter.data,
                                                      feedonly=True)
                full_results_list.append(
                    (entry, entry[1].parser.parse_num_results(), fetch_time))
                break

    return (full_results_list, timeout_list)
def do_calculate_hosted_collections_results(req, lang, vprint, verbosity_level, basic_search_units, hosted_search_engines,
                                            timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
    """Actually search the hosted collections and return their results.

    ``hosted_search_engines`` may be a ``set`` of engines, for which the
    search URLs are built here, or a ``list`` of already built
    ``[search url, engine, user url]`` entries.  Anything else produces
    empty results.

    Returns a ``(full_results_list, timeout_list)`` tuple.  The first list
    contains ``(entry, parsed number of results, fetching time)`` for every
    entry whose download finished; the second contains the entries that did
    not finish.
    """
    # The result is not used; the call is preserved from the original in
    # case it has side effects -- TODO confirm.
    _ = gettext_set_language(lang)
    if not vprint:
        # Defining vprint here probably means this function runs on its own,
        # so the closing "end" trace of the calling calculate function is
        # not printed.
        vprint = get_verbose_print(req, 'Hosted collections (calculate_hosted_collections_search_params): ', verbosity_level)
    vprint(3, 'beginning hosted search')

    # hosted search engines together with their search urls
    engines_list = []
    # results of the downloads that did not time out
    results_list = []
    # final results, one tuple per finished hosted collection
    full_results_list = []
    # entries that timed out
    timeout_list = []

    # isinstance instead of an exact type comparison, so that subclasses of
    # set / list are handled as well
    if isinstance(hosted_search_engines, set):
        for engine in hosted_search_engines:
            url = engine.build_search_url(basic_search_units, req.args, lang)
            user_url = engine.build_user_search_url(basic_search_units, req.args, lang)
            if url:
                engines_list.append([url, engine, user_url])
    elif isinstance(hosted_search_engines, list):
        engines_list.extend(hosted_search_engines)
    # in both cases we end up with [search url, engine, ...] entries

    # the search urls handed to the asynchronous getter
    pagegetters_list = [HTTPAsyncPageGetter(entry[0]) for entry in engines_list]

    def finished(pagegetter, data, current_time):
        """Called each time a download finishes; only records the result."""
        results_list.append((pagegetter, data, current_time))

    # run the asynchronous getter
    finished_list = async_download(pagegetters_list, finished, engines_list, timeout)

    # Build the result tuples.  "is_done" is used as the loop variable so
    # the "finished" callback above is not shadowed.
    for (is_done, entry) in zip(finished_list, engines_list):
        if is_done:
            for result in results_list:
                if result[1] == entry:
                    # the engine is fed the data; it is parsed later, at printing time
                    entry[1].parser.parse_and_get_results(result[0].data, feedonly=True)
                    # (entry, parsed number of found results, fetching time)
                    full_results_list.append(
                        (entry, entry[1].parser.parse_num_results(), result[2])
                    )
                    break
        else:
            timeout_list.append(entry)

    return (full_results_list, timeout_list)