Example #1
def response(resp):
    results = []

    # First retrieve notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
                              + 'db=pubmed&retmode=xml&id={pmids_string}'

    pmids_results = etree.XML(resp.content)
    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
    pmids_string = ''

    for item in pmids:
        pmids_string += item.text + ','

    retrieve_notice_args = dict(pmids_string=pmids_string)

    retrieve_url_encoded = pubmed_retrieve_api_url.format(
        **retrieve_notice_args)

    search_results_xml = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_xml).xpath(
        '//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text

        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        try:
            content = entry.xpath('.//Abstract/AbstractText')[0].text
        except:
            content = gettext('No abstract is available for this publication.')

        # If a DOI is available, add it to the snippet
        try:
            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi,
                                                              content=content)
        except:
            pass

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        res_dict = {'url': url, 'title': title, 'content': content}

        try:
            publishedDate = datetime.strptime(
                entry.xpath('.//DateCreated/Year')[0].text + '-' +
                entry.xpath('.//DateCreated/Month')[0].text + '-' +
                entry.xpath('.//DateCreated/Day')[0].text, '%Y-%m-%d')
            res_dict['publishedDate'] = publishedDate
        except:
            pass

        results.append(res_dict)

    return results
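
A small aside on the example above: the comma-separated PMID string can be built with str.join instead of the concatenation loop, which also avoids the trailing comma (a sketch reusing the example's names):

pmids_string = ','.join(item.text for item in pmids)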
Example #2
def debug_explain_wikidata_query(query, method='GET'):
    if method == 'GET':
        http_response = get(SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query}), headers=get_headers())
    else:
        http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers())
    http_response.raise_for_status()
    return http_response.content
Example #3
def response(resp):
    results = []
    htmlparser = etree.HTMLParser()
    html = fromstring(resp.content.decode("utf-8"), parser=htmlparser)
    search_results = eval_xpath(html, wikidata_ids_xpath)

    if resp.search_params['language'].split('-')[0] == 'all':
        language = 'en'
    else:
        language = match_language(resp.search_params['language'],
                                  supported_languages,
                                  language_aliases).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for search_result in search_results[:result_count]:
        wikidata_id = search_result.split('/')[-1]
        url = url_detail.format(query=urlencode({
            'page': wikidata_id,
            'uselang': language
        }))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.content.decode("utf-8"))
        results += getDetail(jsonresponse, wikidata_id, language,
                             resp.search_params['language'], htmlparser)

    return results
Example #4
def duckduckgo(query):
    # duckduckgo autocompleter
    url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'

    resp = loads(get(url.format(urlencode(dict(q=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []
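
For context, the type=list endpoint answers in the OpenSearch suggestion format, roughly ["<query>", ["suggestion 1", "suggestion 2", ...]], which is why the example returns resp[1]. A hypothetical usage sketch, assuming the same get/loads/urlencode helpers the example relies on:

suggestions = duckduckgo('searx')
print(suggestions)  # a (possibly empty) list of completion strings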
Example #5
def wikipedia(query):
    # wikipedia autocompleter
    url = 'https://en.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'  # noqa

    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []
Example #6
def response(resp):
    results = []

    # First retrieve notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
                              + 'db=pubmed&retmode=xml&id={pmids_string}'

    pmids_results = etree.XML(resp.content)
    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
    pmids_string = ''

    for item in pmids:
        pmids_string += item.text + ','

    retrieve_notice_args = dict(pmids_string=pmids_string)

    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)

    search_results_xml = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text

        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        try:
            content = entry.xpath('.//Abstract/AbstractText')[0].text
        except:
            content = gettext('No abstract is available for this publication.')

        # If a DOI is available, add it to the snippet
        try:
            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
        except:
            pass

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        res_dict = {'url': url,
                    'title': title,
                    'content': content}

        try:
            publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Month')[0].text
                                              + '-' + entry.xpath('.//DateCreated/Day')[0].text, '%Y-%m-%d')
            res_dict['publishedDate'] = publishedDate
        except:
            pass

        results.append(res_dict)

    return results
Example #7
def get_vqd(query, headers):
    query_url = site_url.format(query=urlencode({'q': query}))
    res = get(query_url, headers=headers)
    content = res.text
    if content.find('vqd=\'') == -1:
        raise Exception('Request failed')
    vqd = content[content.find('vqd=\'') + 5:]
    vqd = vqd[:vqd.find('\'')]
    return vqd
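
The token extraction above can also be written with a regular expression; an equivalent sketch operating on the same content string:

import re

match = re.search(r"vqd='([^']+)'", content)
if match is None:
    raise Exception('Request failed')
vqd = match.group(1)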
Example #9
def send_wikidata_query(query, method='GET'):
    if method == 'GET':
        # query will be cached by wikidata
        http_response = get(SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}), headers=get_headers())
    else:
        # query won't be cached by wikidata
        http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers())
    if http_response.status_code != 200:
        logger.debug('SPARQL endpoint error %s', http_response.content.decode())
    logger.debug('request time %s', str(http_response.elapsed))
    http_response.raise_for_status()
    return loads(http_response.content.decode())
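
A hypothetical usage sketch: the endpoint returns standard SPARQL JSON results, so the decoded dictionary exposes the bindings directly (the query text below is purely illustrative):

data = send_wikidata_query('SELECT ?item WHERE { ?item wdt:P31 wd:Q5 } LIMIT 3')
for binding in data['results']['bindings']:
    print(binding['item']['value'])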
Example #10
def request(query, params):
    response_index = get(base_url, headers=params['headers'], raise_for_httperror=True)
    dom = html.fromstring(response_index.text)

    url_params = {'q': query}
    for e in eval_xpath_list(dom, '//input[@type="hidden"]'):
        name = e.get('name')
        value = e.get('value')
        url_params[name] = value

    params['url'] = base_url + '?' + urlencode(url_params)
    params['cookies'] = response_index.cookies
    return params
Example #11
def google(query):
    # google autocompleter
    autocomplete_url = 'http://suggestqueries.google.com/complete/search?client=toolbar&'  # noqa

    response = get(autocomplete_url + urlencode(dict(q=query)))

    results = []

    if response.ok:
        dom = etree.fromstring(response.text)
        results = dom.xpath('//suggestion/@data')

    return results
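
For reference, the toolbar client replies with XML along these lines, which is what the //suggestion/@data XPath collects; a small illustrative check, assuming the same lxml etree import the example uses:

sample = ('<toplevel>'
          '<CompleteSuggestion><suggestion data="searx"/></CompleteSuggestion>'
          '<CompleteSuggestion><suggestion data="searx engine"/></CompleteSuggestion>'
          '</toplevel>')
print(etree.fromstring(sample).xpath('//suggestion/@data'))  # ['searx', 'searx engine']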
Example #12
def dbpedia(query):
    # dbpedia autocompleter
    autocomplete_url = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'  # noqa

    response = get(autocomplete_url + urlencode(dict(QueryString=query)))

    results = []

    if response.ok:
        dom = etree.fromstring(response.content)
        results = dom.xpath('//a:Result/a:Label//text()',
                            namespaces={'a': 'http://lookup.dbpedia.org/'})

    return results
Example #13
def response(resp):
    if resp.status_code == 303:
        return []

    # ping
    headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
    get(url_ping, headers=headers_ping)

    # parse the response
    results = []
    doc = fromstring(resp.text)
    for i, r in enumerate(eval_xpath(doc, result_xpath)):
        if i >= 30:
            break
        try:
            res_url = eval_xpath(r, url_xpath)[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(eval_xpath(r, title_xpath))
        content = extract_text(eval_xpath(r, content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # parse correction
    for correction in eval_xpath(doc, correction_xpath):
        # append correction
        results.append({'correction': extract_text(correction)})

    # return results
    return results
Example #14
def response(resp):
    results = []
    html = fromstring(resp.text)
    wikidata_ids = html.xpath(wikidata_ids_xpath)

    language = resp.search_params['language'].split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for wikidata_id in wikidata_ids[:result_count]:
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.text)
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
Example #15
def response(resp):
    results = []
    html = fromstring(resp.text)
    wikidata_ids = html.xpath(wikidata_ids_xpath)

    language = match_language(resp.search_params['language'], supported_languages).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for wikidata_id in wikidata_ids[:result_count]:
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.text)
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
Example #16
def _is_url_image(image_url):
    if not isinstance(image_url, str):
        return False

    if image_url.startswith('//'):
        image_url = 'https:' + image_url

    if image_url.startswith('data:'):
        return image_url.startswith('data:image/')

    if not _is_url(image_url):
        return False

    retry = 2

    while retry > 0:
        a = time()
        try:
            poolrequests.set_timeout_for_thread(10.0, time())
            r = poolrequests.get(
                image_url,
                timeout=10.0,
                allow_redirects=True,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US;q=0.5,en;q=0.3',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'DNT': '1',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-GPC': '1',
                    'Cache-Control': 'max-age=0'
                })
            if r.headers["content-type"].startswith('image/'):
                return True
            return False
        except requests.exceptions.Timeout:
            logger.error('Timeout for %s: %i', image_url, int(time() - a))
            retry -= 1
        except requests.exceptions.RequestException:
            logger.exception('Exception for %s', image_url)
            return False

    # if every retry timed out, treat the URL as not an image
    return False
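
Illustrative calls with made-up URLs; the helper reports True only when the target answers with an image/* content type before the timeout:

_is_url_image('data:image/png;base64,AAAA')  # True, decided without any request
_is_url_image('//example.org/logo.png')      # fetched as https://example.org/logo.png
_is_url_image(42)                             # False, not a string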
Example #17
def response(resp):
    results = []
    html = fromstring(resp.text)
    search_results = html.xpath(wikidata_ids_xpath)

    if resp.search_params['language'].split('-')[0] == 'all':
        language = 'en'
    else:
        language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]

    # TODO: make requests asynchronous to avoid timeout when result_count > 1
    for search_result in search_results[:result_count]:
        wikidata_id = search_result.split('/')[-1]
        url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
        htmlresponse = get(url)
        jsonresponse = loads(htmlresponse.text)
        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
Example #18
def response(resp):
    results = []
    search_res = json.loads(resp.text)

    wikidata_ids = set()
    for r in search_res.get('query', {}).get('search', {}):
        wikidata_ids.add(r.get('title', ''))

    language = resp.search_params['language'].split('_')[0]
    if language == 'all':
        language = 'en'

    url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids),
                                            'languages': language + '|en'}))

    htmlresponse = get(url)
    jsonresponse = json.loads(htmlresponse.content)
    for wikidata_id in wikidata_ids:
        results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])

    return results
Example #19
def get_vqd(query):
    res = get(site_url.format(query=urlencode({'q': query})))
    content = res.text
    vqd = content[content.find('vqd=\'') + 5:]
    vqd = vqd[:vqd.find('\'')]
    return vqd
Example #20
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warn(
            'Engine name is not lowercase: "{}", converting to lowercase'.
            format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError):
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            pass
        elif param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip,
                                             param_value.split(',')))
        elif param_name == 'proxies':
            engine.proxies = get_proxy_cycles(param_value)
        else:
            setattr(engine, param_name, param_value)

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in ENGINES_LANGUAGES:
        setattr(engine, 'supported_languages',
                ENGINES_LANGUAGES[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(
            engine, 'fetch_supported_languages',
            lambda: engine._fetch_supported_languages(
                get(engine.supported_languages_url)))

    engine.stats = {
        'sent_search_count': 0,  # sent search
        'search_count': 0,  # successful search
        'result_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    engine_type = getattr(engine, 'engine_type', 'online')

    if engine_type != 'offline':
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0

    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(
                engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None

    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
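
A minimal, made-up engine_data mapping of the kind load_engine() consumes; the keys follow what the function reads above, while the values are purely illustrative (real entries come from the instance settings):

engine_data = {
    'name': 'example',        # must be lowercase and contain no underscore
    'engine': 'example',      # loaded as example.py from engine_dir
    'shortcut': 'ex',
    'categories': 'general',  # comma-separated string, split by load_engine
}
engine = load_engine(engine_data)  # the configured module, or None if inactive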
Example #21
def get_google_nid_cookie(google_hostname):
    global nid_cookie
    if google_hostname not in nid_cookie:
        resp = get('https://' + google_hostname)
        nid_cookie[google_hostname] = resp.cookies.get("NID", None)
    return nid_cookie[google_hostname]
Example #22
def get_google_pref_cookie():
    global pref_cookie
    if pref_cookie == '':
        resp = get('https://www.google.com/ncr', allow_redirects=False)
        pref_cookie = resp.cookies["PREF"]
    return pref_cookie
Example #23
def init(engine_settings=None):  # pylint: disable=unused-argument
    parse_extra_param(get(base_url + extra_param_path).text)
Example #26
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error(
            'Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warn(
            'Engine name is not lowercase: "{}", converting to lowercase'.
            format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError,
            ImportError, RuntimeError) as e:
        logger.exception(
            'Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = list(
                    map(str.strip, engine_data['categories'].split(',')))
            continue
        setattr(engine, param_name, engine_data[param_name])

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(
                engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        setattr(
            engine, 'fetch_supported_languages',
            lambda: engine._fetch_supported_languages(
                get(engine.supported_languages_url)))

    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'engine_time': 0,
        'engine_time_count': 0,
        'score_count': 0,
        'errors': 0
    }

    if not engine.offline:
        engine.stats['page_load_time'] = 0
        engine.stats['page_load_count'] = 0

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(
            engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
Example #28
def init(engine_settings=None):
    parse_extra_param(
        get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10'
            ).text)