Пример #1
0
def check_search_result_integrity(grab):
    """Validate a JSON search response held in ``grab.doc``.

    The response is accepted only when the HTTP code is 200 and the raw
    body (bytes) contains the literal marker ``"status":"success"``.

    Raises:
        DataNotValid: if the HTTP code is not 200, or the success marker
            is absent from the body.
    """
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
    # Plain substring test on the raw bytes; the body is not parsed as JSON.
    if b'"status":"success"' not in grab.doc.body:
        raise DataNotValid('JSON success status not found')
Пример #2
0
def check_search_result_integrity(grab):
    """Validate an HTML search result page held in ``grab.doc``.

    Raises:
        RequestBanned: if the HTTP code is 403.
        DataNotValid: if the HTTP code is anything else but 200, or the
            page lacks the ``<link>`` element whose ``href`` contains
            "opensearch" and whose ``title`` contains "DuckDuckGo".
    """
    doc = grab.doc
    status = doc.code
    if status == 403:
        raise RequestBanned('Ban (HTTP code %d)' % status)
    if status != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % status)

    # Marker element expected on every genuine result page.
    marker_xpath = ('//link[contains(@href, "opensearch") and '
                    'contains(@title, "DuckDuckGo")]')
    if not doc(marker_xpath).exists():
        raise DataNotValid('Expected HTML element not found')
Пример #3
0
def check_search_result_integrity(grab):
    """Validate an HTML search result page held in ``grab.doc``.

    Raises:
        DataNotValid: if the HTTP code is not 200, or the page lacks a
            ``<meta name="generator">`` element whose ``content``
            contains "searx/".
    """
    # NOTE: a dedicated "403 -> RequestBanned" check is disabled in the
    # original code, so a 403 response is reported as DataNotValid below.
    doc = grab.doc
    status = doc.code
    if status != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % status)

    generator_xpath = ('//meta[@name="generator" and'
                       ' contains(@content, "searx/")]')
    if not doc(generator_xpath).exists():
        raise DataNotValid('Expected HTML element not found')
Пример #4
0
def check_search_result_integrity(grab):
    """Validate an HTML search result page held in ``grab.doc``.

    Ban indicators are checked first, before the HTTP code is looked at;
    only then are the HTTP code and the expected page marker verified.

    Raises:
        RequestBanned: if the decoded body contains the known ban message,
            or the page contains a ``<form id="captcha_form">``.
        DataNotValid: if the HTTP code is not 200, or the page lacks a
            ``<base>`` element whose ``href`` contains "ixquick.com".
    """
    doc = grab.doc
    if ('your Internet connection has been prevented from accessing it'
            in doc.unicode_body()):
        raise RequestBanned('Found ban message')
    if doc('//form[@id="captcha_form"]').exists():
        raise RequestBanned('Found captcha form')
    if doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % doc.code)
    # The failure path must only raise: no page dump to a fixed temp file
    # and no debugger breakpoint, which would stall an unattended run.
    if not doc('//base[contains(@href, "ixquick.com")]').exists():
        raise DataNotValid('Expected HTML element not found')
Пример #5
0
def check_ajax_search_result_integrity(grab):
    """Validate a JSON (AJAX) search response held in ``grab.doc``.

    The checks run in this order: HTTP code, JSON decoding, presence of a
    non-empty ``body -> serp`` value, and finally the
    ``body -> antirobot -> blocked`` flag.

    Raises:
        DataNotValid: if the HTTP code is not 200, the body cannot be
            decoded as JSON, or ``body -> serp`` is missing or empty.
        RequestBanned: if ``body -> antirobot -> blocked`` is truthy.
    """
    # TODO: ban detection by HTTP code is not implemented; the original
    # placeholder (comparison against -1, marked "FIX") was commented out.
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)

    try:
        info = grab.doc.json
    except (TypeError, ValueError):
        raise DataNotValid('Not valid JSON')

    # A payload that is not an object, or whose "body" is not an object,
    # cannot contain the "serp" key: report it as invalid data instead of
    # failing with AttributeError.
    body = info.get('body') if isinstance(info, dict) else None
    if not isinstance(body, dict) or not body.get('serp'):
        raise DataNotValid('body->serp key not found')

    # A missing "antirobot" section is treated as "not blocked" rather
    # than crashing with KeyError on an otherwise valid payload.
    antirobot = body.get('antirobot')
    if isinstance(antirobot, dict) and antirobot.get('blocked'):
        raise RequestBanned('Ban!')
Пример #6
0
def check_integrity(grab):
    """Check that the response held in ``grab.doc`` has HTTP code 200.

    Raises:
        DataNotValid: if the HTTP code is not 200.
    """
    # NOTE: captcha detection (an <img> whose src contains "/captchaimg?",
    # reported as RequestBanned('Ban (captcha)')) was switched off in the
    # original via ``if False:``; the dead branch is dropped here.
    # TODO: ban detection by HTTP code is also not implemented.
    if grab.doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % grab.doc.code)
Пример #7
0
def check_integrity(grab):
    """Run the basic checks on the response held in ``grab.doc``.

    The captcha check runs first, before the HTTP code is examined.

    Raises:
        RequestBanned: if the page contains an ``<img>`` whose ``src``
            contains "/captchaimg?".
        DataNotValid: if the HTTP code is not 200.
    """
    # TODO: ban detection by HTTP code is not implemented (the original
    # placeholder was commented out and marked "FIX").
    doc = grab.doc
    captcha_images = doc.select('//img[contains(@src, "/captchaimg?")]')
    if captcha_images.exists():
        raise RequestBanned('Ban (captcha)')
    if doc.code != 200:
        raise DataNotValid('Non-200 HTTP code: %d' % doc.code)
Пример #8
0
def check_cache_integrity(grab):
    """Validate a cache page held in ``grab.doc``.

    Runs ``check_integrity(grab)`` first, then requires a
    ``<div class="cacheContent">`` element on the page.

    Raises:
        DataNotValid: if the ``cacheContent`` div is absent. Exceptions
            raised by ``check_integrity`` propagate unchanged.
    """
    check_integrity(grab)
    cache_block = grab.doc('//div[@class="cacheContent"]')
    if cache_block.exists():
        return
    raise DataNotValid('Div[@class="cacheContent"] not found')
Пример #9
0
def check_search_result_integrity(grab):
    """Validate a search result page held in ``grab.doc``.

    Runs ``check_integrity(grab)`` first, then requires an
    ``<input name="p">`` element on the page.

    Raises:
        DataNotValid: if the ``p`` input is absent. Exceptions raised by
            ``check_integrity`` propagate unchanged.
    """
    check_integrity(grab)
    query_input = grab.doc('//input[@name="p"]')
    if query_input.exists():
        return
    raise DataNotValid('Search query input not found')
Пример #10
0
def check_cache_integrity(grab):
    """Validate a cache page held in ``grab.doc``.

    Runs ``check_integrity(grab)`` first. A 404 response is then accepted
    without further inspection; any other response must contain a
    ``<div id="google-cache-hdr">`` element.

    Raises:
        DataNotValid: if the response is not a 404 and the cache header
            div is absent. Exceptions raised by ``check_integrity``
            propagate unchanged.
    """
    check_integrity(grab)
    doc = grab.doc
    # NOTE(review): whether a 404 can reach this point depends on what
    # check_integrity does with non-200 codes - confirm against it.
    if doc.code == 404:
        return
    if not doc('//div[@id="google-cache-hdr"]').exists():
        raise DataNotValid('Google Cache Header not found')
Пример #11
0
def check_search_result_integrity(grab):
    """Validate a search result page held in ``grab.doc``.

    Runs ``check_integrity(grab)`` first, then requires a
    ``<div id="res">`` element on the page.

    Raises:
        DataNotValid: if the ``res`` div is absent. Exceptions raised by
            ``check_integrity`` propagate unchanged.
    """
    check_integrity(grab)
    results_block = grab.doc('//div[@id="res"]')
    if results_block.exists():
        return
    raise DataNotValid('Content of response has unexpected format.')
Пример #12
0
def check_cache_integrity(grab):
    """Validate a cache page held in ``grab.doc``.

    Runs ``check_integrity(grab)`` first, then requires a ``<script>``
    element whose ``src`` contains "yandex.st/hilitedaemon-js".

    Raises:
        DataNotValid: if that script element is absent. Exceptions raised
            by ``check_integrity`` propagate unchanged.
    """
    check_integrity(grab)
    script_xpath = ('//script[contains(@src,'
                    '"yandex.st/hilitedaemon-js")]')
    if grab.doc(script_xpath).exists():
        return
    raise DataNotValid('Expected yandex.st script not found')
Пример #13
0
def check_search_result_integrity(grab):
    """Validate a search result page held in ``grab.doc``.

    Runs ``check_integrity(grab)`` first, then requires an
    ``<input name="text">`` element on the page.

    Raises:
        DataNotValid: if the ``text`` input is absent. Exceptions raised
            by ``check_integrity`` propagate unchanged.
    """
    check_integrity(grab)
    text_input = grab.doc('//input[@name="text"]')
    if text_input.exists():
        return
    raise DataNotValid('Expected HTML element not found')