Example #1
File: yahoo.py Project: Reventl0v/searx
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except:
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url, 
                        'title': title, 
                        'content': content})

    # if the engine has no suggestion_xpath configured, return results
    if not suggestion_xpath:
        return results

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
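
All of these response() parsers lean on module-level names defined at the top of each engine file: the XPath selectors (results_xpath, url_xpath, title_xpath, content_xpath, suggestion_xpath), the engine's search_url, and the searx helpers extract_text, extract_url and parse_url. A minimal sketch of that surrounding configuration; the names match the snippets, but the selector values below are placeholders, not the real Yahoo markup:

# Hypothetical module-level setup the snippets in this listing assume.
# Selector values are illustrative only and vary per engine and version.
search_url = 'https://search.yahoo.com/search?p={query}'
results_xpath = '//div[contains(@class, "algo")]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[contains(@class, "compText")]'
suggestion_xpath = '//div[contains(@class, "AlsoTry")]//a'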
Example #2
File: google.py Project: Reventl0v/searx
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)
            if parsed_url.netloc == google_hostname and parsed_url.path == search_path:
                # remove the link to google news
                continue

            if parsed_url.netloc == google_hostname and parsed_url.path == images_path:
                # images result
                results = results + parse_images(result)
            else:
                # normal result
                content = extract_text(result.xpath(content_xpath)[0])
                # append result
                results.append({'url': url, 
                                'title': title, 
                                'content': content})
        except:
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
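
The google.py variants filter out Google-internal links by comparing the parsed result URL against the engine's hostname and known paths. The same check in isolation, with hypothetical values standing in for the module globals google_hostname and search_path:

from urllib.parse import urlparse

google_hostname = 'www.google.com'   # assumption: value of the module global
search_path = '/search'              # assumption: value of the module global

url = 'https://www.google.com/search?q=example&tbm=nws'
parsed_url = urlparse(url)
# True means the result links back into Google (e.g. Google News)
# and is skipped by the loop above
print(parsed_url.netloc == google_hostname and parsed_url.path == search_path)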
Example #3
File: yahoo.py Project: ype/searx
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except:
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # if no suggestion found, return results
    suggestions = dom.xpath(suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
Example #4
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(
                minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(
                days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$",
                      publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except:
                publishedDate = datetime.now()

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({
            'url': url,
            'title': title,
            'content': content,
            'publishedDate': publishedDate
        })

    # return results
    return results
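
The publishedDate branch turns relative timestamps like '21 minutes ago' or '3 hours, 15 minutes ago' into datetime objects, falling back to dateutil for absolute dates. The same logic as a self-contained sketch, assuming python-dateutil is installed (as the snippet's parser.parse implies):

import re
from datetime import datetime, timedelta
from dateutil import parser

def parse_published_date(text):
    # "N minute(s) ago" -> now minus N minutes
    if re.match(r'^[0-9]+ minute(s|) ago$', text):
        return datetime.now() - timedelta(
            minutes=int(re.match(r'\d+', text).group()))
    # "N day(s) ago" -> now minus N days
    if re.match(r'^[0-9]+ days? ago$', text):
        return datetime.now() - timedelta(
            days=int(re.match(r'\d+', text).group()))
    # "N hour(s), M minute(s) ago" -> now minus both
    if re.match(r'^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$', text):
        hours, minutes = (int(n) for n in re.findall(r'\d+', text))
        return datetime.now() - timedelta(hours=hours, minutes=minutes)
    # anything else: let dateutil try, defaulting to "now" on failure
    try:
        return parser.parse(text)
    except (ValueError, OverflowError):
        return datetime.now()

print(parse_published_date('3 hours, 15 minutes ago'))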
Example #5
File: yahoo_news.py Project: germc/searx
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in dom.xpath(results_xpath):
        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])
        results.append({'url': url, 'title': title, 'content': content})

    if not suggestion_xpath:
        return results

    for suggestion in dom.xpath(suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
Example #6
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except:
                publishedDate = datetime.now()

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    # return results
    return results
Example #7
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)
            if (parsed_url.netloc == google_hostname
                    and parsed_url.path == search_path):
                # remove the link to google news
                continue

            # images result
            if (parsed_url.netloc == google_hostname
                    and parsed_url.path == images_path):
                # only thumbnail image provided,
                # so skipping image results
                # results = results + parse_images(result)
                pass
            else:
                # normal result
                content = extract_text(result.xpath(content_xpath)[0])
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
Example #8
def response(resp):
    results = []
    dom = fromstring(resp.text)

    # trim results so there aren't way too many at once
    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
    all_results = eval_xpath_list(dom, results_xpath)
    trimmed_results = all_results[first_result_index:first_result_index +
                                  page_size]

    # get results
    for result in trimmed_results:
        # remove ahmia url and extract the actual url for the result
        raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1),
                              search_url)
        cleaned_url = parse_qs(urlparse(raw_url).query).get(
            'redirect_url', [''])[0]

        title = extract_text(eval_xpath(result, title_xpath))
        content = extract_text(eval_xpath(result, content_xpath))

        results.append({
            'url': cleaned_url,
            'title': title,
            'content': content,
            'is_onion': True
        })

    # get spelling corrections
    for correction in eval_xpath_list(dom, correction_xpath):
        results.append({'correction': extract_text(correction)})

    # get number of results
    number_of_results = eval_xpath(dom, number_of_results_xpath)
    if number_of_results:
        try:
            results.append(
                {'number_of_results': int(extract_text(number_of_results))})
        except:
            pass

    return results
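
Ahmia wraps every result in a redirect link; the snippet recovers the target by reading the redirect_url query parameter. The extraction in isolation, on a made-up URL of that shape:

from urllib.parse import urlparse, parse_qs

# hypothetical ahmia-style redirect link
raw_url = 'https://ahmia.fi/search/redirect?redirect_url=http%3A%2F%2Fexample.onion%2F'
cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
print(cleaned_url)  # http://example.onion/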
Example #9
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    try:
        results_num = int(
            eval_xpath(dom,
                       '//div[@class="compPagination"]/span[last()]/text()')
            [0].split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), search_url))
            title = extract_text(eval_xpath(result, title_xpath)[0])
        except:
            continue

        content = extract_text(eval_xpath(result, content_xpath)[0])

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # if no suggestion found, return results
    suggestions = eval_xpath(dom, suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
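
The number_of_results branch assumes the last span of the pagination footer holds text like '1,060,000 results'; .split()[0] takes the number and .replace(',', '') strips the thousands separators before int(). With a hypothetical sample of that text:

# hypothetical pagination text as the XPath above would return it
text = '1,060,000 results'
results_num = int(text.split()[0].replace(',', ''))
print(results_num)  # 1060000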
Example #10
File: yahoo.py Project: LuccoJ/searx
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    try:
        results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0]
                          .split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except:
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # if no suggestion found, return results
    suggestions = dom.xpath(suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
Example #11
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in dom.xpath(results_xpath):
        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa
        else:
            if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$",
                        publishedDate):
                timeNumbers = re.findall(r'\d+', publishedDate)
                publishedDate = datetime.now()\
                    - timedelta(hours=int(timeNumbers[0]))\
                    - timedelta(minutes=int(timeNumbers[1]))
            else:
                # TODO year in string possible?
                publishedDate = datetime.strptime(publishedDate,
                                                  "%b %d %H:%M%p")

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    if not suggestion_xpath:
        return results

    for suggestion in dom.xpath(suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
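
The publishedDate.year == 1900 check compensates for datetime.strptime: when the format string carries no year field, the parsed datetime defaults its year to 1900, so the code patches in the current year afterwards. A quick demonstration:

from datetime import datetime

parsed = datetime.strptime('Mar 14 10:30AM', '%b %d %H:%M%p')
print(parsed.year)  # 1900 -- no %Y in the format, so strptime defaults it
parsed = parsed.replace(year=datetime.now().year)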
Example #12
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in dom.xpath(results_xpath):
        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa
        else:
            if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$",
                        publishedDate):
                timeNumbers = re.findall(r'\d+', publishedDate)
                publishedDate = datetime.now()\
                    - timedelta(hours=int(timeNumbers[0]))\
                    - timedelta(minutes=int(timeNumbers[1]))
            else:
                publishedDate = parser.parse(publishedDate)

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    if not suggestion_xpath:
        return results

    for suggestion in dom.xpath(suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
Example #13
File: google.py Project: asciimoo/searx
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
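
The "google sorry" guard fires when Google redirects a blocked client to its CAPTCHA page; urlparse on the response URL makes that cheap to detect. Note also that urlparse's second positional argument is a default scheme, so urlparse(url, google_hostname) in the result loop only takes effect for scheme-less URLs. A check on a hypothetical blocked-redirect URL:

from urllib.parse import urlparse

resp_url = urlparse('https://www.google.com/sorry/index?continue=https://www.google.com/search')
print(resp_url.path.startswith('/sorry'))  # True -> raise 'CAPTCHA required'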
Example #14
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': escape(extract_text(suggestion))})

    # return results
    return results
Example #15
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[contains(@id, "ires")]//div[contains(@class, "hp-xpdbox")]')
    if instant_answer:
        answer_re = r'(?P<prefix><a\s+href=")\/url\?q=(?P<url>[^"]+?)\&amp\;[^"]*(?P<suffix>"\s*>)'
        answer_subst = "\\g<prefix>\\g<url>\\g<suffix>"
        answer_html = ['<br>']
        for element in instant_answer:
            answer_html.append(etree.tostring(element, method="html"))
        answer_str = u' '.join(answer_html)
        answer_fixed = re.sub(answer_re, answer_subst, answer_str, 0, re.MULTILINE)
        results.append({'answer': answer_fixed})

    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
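
The instant-answer block above serializes the answer box back to HTML and rewrites Google's /url?q=... redirect anchors into direct links with one regex substitution (as written it is Python 2-era code: on Python 3, etree.tostring returns bytes, so u' '.join would fail). The substitution in isolation, on a hypothetical wrapped anchor:

import re

answer_re = r'(?P<prefix><a\s+href=")\/url\?q=(?P<url>[^"]+?)\&amp\;[^"]*(?P<suffix>"\s*>)'
answer_subst = "\\g<prefix>\\g<url>\\g<suffix>"

# hypothetical Google-wrapped link as it appears in the serialized answer box
html_in = '<a href="/url?q=https://example.com/page&amp;sa=U&amp;ved=abc">'
print(re.sub(answer_re, answer_subst, html_in))
# -> <a href="https://example.com/page">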
Example #16
File: google.py Project: kvch/searx
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == "sorry.google.com" or resp_url.path == "/sorry/IndexRedirect":
        raise RuntimeWarning("sorry.google.com")

    # which hostname ?
    google_hostname = resp.search_params.get("google_hostname")
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({"url": url, "title": title, "content": content})
        except:
            logger.debug("result parse error in:\n%s", etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({"suggestion": extract_text(suggestion)})

    # return results
    return results
Example #17
File: google.py Project: 912d/searx
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path))
               or (parsed_url.netloc.startswith(map_hostname_start))):
                x = result.xpath(map_near)
                if len(x) > 0:
                    # map : near the location
                    results = results + parse_map_near(parsed_url, x, google_hostname)
                else:
                    # map : detail about a location
                    results = results + parse_map_detail(parsed_url, result, google_hostname)

            # google news
            elif (parsed_url.netloc == google_hostname
                  and parsed_url.path == search_path):
                # skipping news results
                pass

            # images result
            elif (parsed_url.netloc == google_hostname
                  and parsed_url.path == images_path):
                # only thumbnail image provided,
                # so skipping image results
                # results = results + parse_images(result, google_hostname)
                pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': escape(extract_text(suggestion))})

    # return results
    return results
Example #18
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), google_url),
                            google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if ((parsed_url.netloc == google_hostname
                 and parsed_url.path.startswith(maps_path))
                    or (parsed_url.netloc.startswith(map_hostname_start))):
                x = result.xpath(map_near)
                if len(x) > 0:
                    # map : near the location
                    results = results + parse_map_near(parsed_url, x,
                                                       google_hostname)
                else:
                    # map : detail about a location
                    results = results + parse_map_detail(
                        parsed_url, result, google_hostname)

            # google news
            elif (parsed_url.netloc == google_hostname
                  and parsed_url.path == search_path):
                # skipping news results
                pass

            # images result
            elif (parsed_url.netloc == google_hostname
                  and parsed_url.path == images_path):
                # only thumbnail image provided,
                # so skipping image results
                # results = results + parse_images(result, google_hostname)
                pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result,
                                                     content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
Example #19
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(
            eval_xpath(
                dom, '//div[@id="resultStats"]//text()')[0].split()[1].replace(
                    ',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            title = extract_text(eval_xpath(result, title_xpath)[0])
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), google_url),
                google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = eval_xpath(result, map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue

                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
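
The later variants replace raw .xpath() calls with searx's eval_xpath/eval_xpath_list helpers, which wrap lxml XPath evaluation with expression caching and clearer error reporting. A minimal stand-in with the same call shape, if you want to run these snippets outside searx (assumption: simplified, no caching and no searx-specific exceptions):

def eval_xpath(element, xpath_str):
    # stand-in: the real helper compiles, caches and error-wraps the expression
    return element.xpath(xpath_str)

def eval_xpath_list(element, xpath_str, min_len=None):
    # stand-in: the real helper also validates that the result is a list
    result = element.xpath(xpath_str)
    if min_len is not None and len(result) < min_len:
        raise ValueError('not enough results for xpath: ' + xpath_str)
    return result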