def do_external_search(req, lang, vprint, basic_search_units, search_engines):
    """Make the external search."""
    _ = gettext_set_language(lang)
    vprint(3, 'beginning external search')
    engines_list = []

    for engine in search_engines:
        url = engine.build_search_url(basic_search_units, req.args, lang)
        if url:
            engines_list.append([url, engine])

    pagegetters_list = [
        HTTPAsyncPageGetter(engine[0]) for engine in engines_list
    ]

    def finished(pagegetter, data, current_time):
        """Function called, each time the download of a web page finish.
        Will parse and print the results of this page."""
        print_results(req, lang, pagegetter, data, current_time)

    finished_list = async_download(pagegetters_list, finished, engines_list,
                                   CFG_EXTERNAL_COLLECTION_TIMEOUT)

    for (finished, engine) in zip(finished_list, engines_list):
        if not finished:
            url = engine[0]
            name = engine[1].name
            print_timeout(req, lang, engine[1], name, url)
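
# A minimal, self-contained sketch of the fetch pattern used above: wrap each URL in an
# HTTPAsyncPageGetter, hand the whole batch to async_download() with a shared timeout,
# and read back one boolean per getter telling whether it finished in time.  The import
# path below is an assumption for illustration; as used above, async_download() appears
# to call the callback as callback(pagegetter, data_list[i], current_time) for every
# page that arrives before the timeout, and each pagegetter exposes .status and .data.
from invenio.websearch_external_collections_getter import (
    HTTPAsyncPageGetter, async_download)

def fetch_pages(urls, timeout=10):
    """Download several pages concurrently and return the URLs that timed out."""
    getters = [HTTPAsyncPageGetter(url) for url in urls]

    def on_page(pagegetter, url, dummy_time):
        # pagegetter.status is None when the request failed outright.
        print('%s -> status %s, %d bytes' %
              (url, pagegetter.status, len(pagegetter.data or '')))

    finished = async_download(getters, on_page, urls, timeout)
    return [url for (url, done) in zip(urls, finished) if not done]
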
    def test_async_download(self):
        """websearch_external_collections_getter - asynchronous download"""

        ## Test various cases for the async_download function:
        ##   - test 1 working page: invenio-software.org
        ##   - test 1 unresolvable name: rjfreijoiregjreoijgoirg.fr
        ##   - test 1 bad IP: 1.2.3.4
        ## Collect the errors and assert that none occurred.
        checks = [
            {'url': 'http://invenio-software.org', 'content': 'About Invenio'},
            {'url': 'http://rjfreijoiregjreoijgoirg.fr'},
            {'url': 'http://1.2.3.4/'}]

        def cb_finished(pagegetter, check, current_time):
            """Function called when a page is received."""
            is_ok = pagegetter.status is not None

            if 'content' in check and is_ok:
                is_ok = pagegetter.data.find(check['content']) != -1

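            # A check passes when the outcome matches the expectation: entries with a
            # 'content' key are expected to download and contain that snippet, while
            # entries without one (unresolvable host, bad IP) are expected to fail.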
            check['result'] = is_ok == ('content' in check)

        pagegetters = [HTTPAsyncPageGetter(check['url']) for check in checks]
        finished_list = async_download(pagegetters, cb_finished, checks, 20)

        for (finished, check) in zip(finished_list, checks):
            if not finished:
                check['result'] = 'content' not in check

        errors = [check for check in checks if not check['result']]

        self.assertEqual(errors, [])
    def test_async_download(self):
        """websearch_external_collections_getter - asynchronous download"""

        ## Test various cases for the async_download function:
        ##   - test 1 working page: cdsware
        ##   - test 1 unresolvable name: rjfreijoiregjreoijgoirg.fr
        ##   - test 1 bad IP: 1.2.3.4
        ## Collect the errors and assert that none occurred.
        checks = [
            {'url': 'http://cdsware.cern.ch/invenio/index.html', 'content': '<title>CDS Invenio: Overview</title>'},
            {'url': 'http://rjfreijoiregjreoijgoirg.fr'},
            {'url': 'http://1.2.3.4/'}]

        def finished(pagegetter, check, current_time):
            """Function called when a page is received."""
            is_ok = pagegetter.status is not None

            if 'content' in check and is_ok:
                is_ok = pagegetter.data.find(check['content']) != -1

            check['result'] = is_ok == ('content' in check)

        pagegetters = [HTTPAsyncPageGetter(check['url']) for check in checks]
        finished_list = async_download(pagegetters, finished, checks, 20)

        for (finished, check) in zip(finished_list, checks):
            if not finished:
                check['result'] = 'content' not in check

        errors = [check for check in checks if not check['result']]

        self.assertEqual(errors, [])
def do_external_search(req, lang, vprint, basic_search_units, search_engines):
    """Make the external search."""
    _ = gettext_set_language(lang)
    vprint(3, 'beginning external search')
    engines_list = []

    for engine in search_engines:
        url = engine.build_search_url(basic_search_units, req.args, lang)
        user_url = engine.build_user_search_url(basic_search_units, req.args, lang)
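        # user_url is the user-facing variant of the search URL; it is stored in the
        # engine entry below but not used further in this snippet.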
        if url:
            engines_list.append([url, engine, user_url])

    pagegetters_list = [HTTPAsyncPageGetter(engine[0]) for engine in engines_list]

    def finished(pagegetter, data, current_time):
        """Function called, each time the download of a web page finish.
        Will parse and print the results of this page."""
        print_results(req, lang, pagegetter, data, current_time)

    finished_list = async_download(pagegetters_list, finished, engines_list, CFG_EXTERNAL_COLLECTION_TIMEOUT)

    for (finished, engine) in zip(finished_list, engines_list):
        if not finished:
            url = engine[0]
            name = engine[1].name
            print_timeout(req, lang, engine[1], name, url)
    def test_async_download(self):
        """websearch_external_collections_getter - asynchronous download"""

        ## Test various cases for the async_download function:
        ##   - test 1 working page: invenio-software.org
        ##   - test 1 unresolvable name: rjfreijoiregjreoijgoirg.fr
        ##   - test 1 bad IP: 1.2.3.4
        ## Collect the errors and assert that none occurred.
        checks = [
            {'url': 'http://invenio-software.org', 'content': 'About Invenio'},
            {'url': 'http://rjfreijoiregjreoijgoirg.fr'},
            {'url': 'http://1.2.3.4/'}]

        def finished(pagegetter, check, current_time):
            """Function called when a page is received."""
            is_ok = pagegetter.status is not None

            if 'content' in check and is_ok:
                is_ok = pagegetter.data.find(check['content']) != -1

            check['result'] = is_ok == ('content' in check)

        pagegetters = [HTTPAsyncPageGetter(check['url']) for check in checks]
        finished_list = async_download(pagegetters, finished, checks, 20)

        for (finished, check) in zip(finished_list, checks):
            if not finished:
                check['result'] = 'content' not in check

        errors = [check for check in checks if not check['result']]

        self.assertEqual(errors, [])
def do_calculate_external_records(
        req_args,
        basic_search_units,
        external_search_engines,
        timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT,
        limit=CFG_EXTERNAL_COLLECTION_MAXRESULTS_ALERTS):
    """Function that returns the external records found and the potential time outs
    given the basic search units or the req arguments and a set of hosted collections engines."""

    # list to hold the hosted search engines and their respective search urls
    engines_list = []
    # list to hold the non timed out results
    results_list = []
    # list to hold all the results
    full_results_list = []
    # list to hold all the timeouts
    timeout_list = []

    for engine in external_search_engines:
        url = engine.build_search_url(basic_search_units,
                                      req_args,
                                      limit=limit)
        if url:
            engines_list.append([url, engine])
    # we end up with a list of [search url, engine] entries

    # create the list of search urls to be handed to the asynchronous getter
    pagegetters_list = [
        HTTPAsyncPageGetter(engine[0]) for engine in engines_list
    ]

    # function to be run on every result
    def finished(pagegetter, data, dummy_time):
        """Function called, each time the download of a web page finish.
        Will parse and print the results of this page."""
        # each pagegetter that didn't timeout is added to this list
        results_list.append((pagegetter, data))

    # run the asynchronous getter
    finished_list = async_download(pagegetters_list, finished, engines_list,
                                   timeout)

    # create the complete list of tuples, one for each hosted collection, with the results and other information,
    # including those that timed out
    for (finished, engine) in zip(finished_list, engines_list):
        if finished:
            for result in results_list:
                if result[1] == engine:
                    engine[1].parser.parse_and_get_results(result[0].data,
                                                           feedonly=True)
                    full_results_list.append(
                        (engine[1].name,
                         engine[1].parser.parse_and_extract_records(of="xm")))
                    break
        else:
            timeout_list.append(engine[1].name)

    return (full_results_list, timeout_list)
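
# A hedged sketch of how a caller might consume the pair returned above: full_results_list
# holds (engine name, records extracted in "xm", i.e. MARCXML, format) tuples, while
# timeout_list holds the names of the engines that did not answer within the timeout.
# The helper name and its arguments are illustrative, not part of the original module.
def report_external_records(req_args, basic_search_units, engines):
    results, timeouts = do_calculate_external_records(
        req_args, basic_search_units, engines)
    for name, records in results:
        print('%s answered; records extracted: %s' % (name, bool(records)))
    for name in timeouts:
        print('%s timed out' % name)
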
def download_and_parse():
    """Try to make a query that always return results on all search engines.
    Check that a page is well returned and that the result can be parsed.

    This test is not included in the general test suite.

    This test give false positive if any of the external server is non working or too slow.
    """
    test = [['+', 'ieee', '', 'w']]
    errors = []

    external_collections = external_collections_dictionary.values()
    urls = [engine.build_search_url(test) for engine in external_collections]
    pagegetters = [HTTPAsyncPageGetter(url) for url in urls]
    dummy = async_download(pagegetters, None, None, 30)

    for (page, engine, url) in zip(pagegetters, external_collections, urls):
        if not url:
            errors.append("Unable to build url for : " + engine.name)
            continue
        if len(page.data) == 0:
            errors.append("Zero sized page with : " + engine.name)
            continue
        if engine.parser:
            results = engine.parser.parse_and_get_results(page.data)
            num_results = engine.parser.parse_num_results()
            if len(results) == 0:
                errors.append("Unable to parse results for : " + engine.name)
                continue
            if not num_results:
                errors.append(
                    "Unable to parse (None returned) number of results for : "
                    + engine.name)
                continue
            try:
                num_results = int(num_results)
            except (TypeError, ValueError):
                errors.append(
                    "Unable to parse (not a number) number of results for : " +
                    engine.name)

    return errors
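
# download_and_parse() is deliberately kept out of the automatic test suite because it
# queries live external servers; a plausible way to exercise it by hand is sketched
# below (this helper is an illustration, not part of the original module).
def run_download_and_parse_check():
    """Run the manual external-collections check and print any problems found."""
    problems = download_and_parse()
    if not problems:
        print('All external collection engines answered and parsed correctly.')
    for problem in problems:
        print(problem)
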
def do_calculate_hosted_collections_results(
        req,
        lang,
        vprint,
        verbosity_level,
        basic_search_units,
        hosted_search_engines,
        timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
    """Actually search the hosted collections and return their results and information in a list of tuples.
    One tuple for each hosted collection. Handles timeouts"""

    _ = gettext_set_language(lang)
    if not vprint:
        vprint = get_verbose_print(
            req,
            'Hosted collections (calculate_hosted_collections_search_params): ',
            verbosity_level)
        # Defining vprint only at this point probably means we are running just this one function,
        # so the "verbose" end-of-hosted-search string will not be printed (it is normally printed
        # by the initial calculate function). Either define a flag here to print it at the end of
        # this function, or redefine the whole "verbose" printing logic of the functions above.
    vprint(3, 'beginning hosted search')

    # list to hold the hosted search engines and their respective search urls
    engines_list = []
    # list to hold the non timed out results
    results_list = []
    # list to hold all the results
    full_results_list = []
    # list to hold all the timeouts
    timeout_list = []

    # in case this is an engine-only list
    if type(hosted_search_engines) is set:
        for engine in hosted_search_engines:
            url = engine.build_search_url(basic_search_units, req.args, lang)
            user_url = engine.build_user_search_url(basic_search_units,
                                                    req.args, lang)
            if url:
                engines_list.append([url, engine, user_url])
    # in case we are iterating a pre calculated url+engine list
    elif type(hosted_search_engines) is list:
        for engine in hosted_search_engines:
            engines_list.append(engine)
    # in both cases we end up with a list of [search url, engine, user url] entries

    # create the list of search urls to be handed to the asynchronous getter
    pagegetters_list = [
        HTTPAsyncPageGetter(engine[0]) for engine in engines_list
    ]

    # function to be run on every result
    def finished(pagegetter, data, current_time):
        """Function called, each time the download of a web page finish.
        Will parse and print the results of this page."""
        # each pagegetter that didn't timeout is added to this list
        results_list.append((pagegetter, data, current_time))

    # run the asynchronous getter
    finished_list = async_download(pagegetters_list, finished, engines_list,
                                   timeout)

    # create the complete list of tuples, one for each hosted collection, with the results and other information,
    # including those that timed out
    for (finished, engine) in zip(finished_list, engines_list):
        if finished:
            for result in results_list:
                if result[1] == engine:
                    # the engine is fed the results, it will be parsed later, at printing time
                    engine[1].parser.parse_and_get_results(result[0].data,
                                                           feedonly=True)
                    ## the appended tuple contains:
                    ## * the engine entry itself: [search url, engine, user url]
                    ## * the parsed number of found results
                    ## * the fetching time
                    full_results_list.append(
                        (engine, engine[1].parser.parse_num_results(),
                         result[2]))
                    break
        else:
            ## the appended entry is the engine entry itself:
            ## [search url, engine, user url]
            timeout_list.append(engine)

    return (full_results_list, timeout_list)
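
# A hedged sketch of unpacking the return value above.  Each full_results_list entry is
# (engine entry, parsed number of results, fetch time), where the engine entry is the
# [search url, engine, user url] list built earlier; timeout_list holds the engine
# entries that never answered.  The helper and its arguments are placeholders.
def summarize_hosted_results(req, basic_search_units, engines):
    results, timeouts = do_calculate_hosted_collections_results(
        req, 'en', None, 3, basic_search_units, engines)
    for entry, num_results, fetch_time in results:
        print('%s: %s results (fetched in %s)' %
              (entry[1].name, num_results, fetch_time))
    for entry in timeouts:
        print('%s timed out' % entry[1].name)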