Exemplo n.º 1
0
def test_request_ua():
    """Fetch httpbin's ``/get`` with a custom User-Agent and check the parsed
    JSON response reports the same value."""
    user_agent = "Python3.5"
    request = Request(
        'http://www.httpbin.org/get',
        method='GET',
        res_type='json',
        headers={"User-Agent": user_agent},
    )
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(request.fetch())
    reported_headers = result.html['headers']
    assert reported_headers['User-Agent'] == "Python3.5"
Exemplo n.º 2
0
def test_method_error_request():
    """
    Check how ``Request`` reacts to the ``PUT`` method.

    Any exception raised while building or fetching the request must be an
    ``InvalidRequestMethod``; if nothing is raised, the response text must be
    the empty string.
    """
    try:
        request = Request("https://httpbin.org/", method="PUT")
        loop = asyncio.get_event_loop()
        response = loop.run_until_complete(request.fetch())
        # ``await`` is a syntax error inside a plain ``def``, so the
        # ``response.text()`` coroutine is driven on the event loop instead.
        body = loop.run_until_complete(response.text())
        assert body == ""
    except Exception as e:
        assert isinstance(e, InvalidRequestMethod)
Exemplo n.º 3
0
 async def _localImage_or_webImage_parse(self, request: Request,
                                         spider_ins):
     """
     Fill the service payload from the request URL (local image or web
     image) during middle.request.

     * URLs starting with ``https`` are rejected with ``ImageTypeError``.
     * URLs starting with ``http`` are stored under the payload's ``url``
       key (any ``image`` entry is dropped).
     * Anything else must end in ``jpg``/``png``/``bmp``/``peg`` (last three
       characters); it is loaded through ``self.get_ocr_image`` and stored
       under the payload's ``image`` key (any ``url`` entry is dropped).

     :param request: Request
     :param spider_ins: object whose ``ocr_options['region']`` is forwarded
         to ``get_ocr_image``
     """
     source = request.url

     # Remote https links are refused outright.
     if source.startswith('https'):
         logger.error('Baidu-ocr does not support remote https image link,'
                      'check your start_urls')
         request.retry_times = 0
         raise ImageTypeError

     # Plain http link: hand the URL itself to the service.
     if source.startswith('http'):
         self._service_payload.pop('image', None)
         self._service_payload.update(url=source)
         return

     # Everything else is checked by its trailing three characters.
     if source[-3:] not in ('jpg', 'png', 'bmp', 'peg'):
         logger.error('Baidu does not support this type of picture , '
                      'must be `jpg`, `png`, `bmp` or `jpeg`')
         request.retry_times = 0
         request._ok = False
         raise ImageTypeError

     image, _ = await self.get_ocr_image(
         source,
         request,
         spider_ins,
         hook=self.get_ocr_image_hook,
         region=spider_ins.ocr_options['region'])
     self._service_payload.pop('url', None)
     self._service_payload.update(image=image)
Exemplo n.º 4
0
def test_method_error_request():
    """
    Build and fetch a ``PUT`` request to httpbin.

    If nothing is raised the response's ``html`` must be the empty string;
    any exception raised along the way must be an ``InvalidRequestMethod``.
    """
    try:
        put_request = Request('https://httpbin.org/', method='PUT')
        event_loop = asyncio.get_event_loop()
        put_response = event_loop.run_until_complete(put_request.fetch())
        assert put_response.html == ''
    except Exception as error:
        assert isinstance(error, InvalidRequestMethod)
Exemplo n.º 5
0
def test_request_params():
    """Fetch httpbin's ``/get`` with a ``name`` query parameter and check the
    parsed JSON response reports it under ``args``."""
    request = Request(
        'http://www.httpbin.org/get',
        method='GET',
        res_type='json',
        params={"name": "ruia"},
    )
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(request.fetch())
    reported_args = result.html['args']
    assert reported_args['name'] == "ruia"
Exemplo n.º 6
0
def test_delay_false():
    """Check that ``fetch(delay=False)`` finishes in under 10 seconds even
    though the request is configured with ``DELAY`` set to 10."""
    request = Request("https://httpbin.org/", request_config={"DELAY": 10})

    # Time the whole fetch.
    started_at = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(request.fetch(delay=False))
    elapsed = time.time() - started_at

    # The configured delay must have been ignored.
    assert elapsed < 10
Exemplo n.º 7
0
async def sec_request():
    """
    Build the follow-up POST request from the data gathered by
    ``request_example``.

    The form data is passed as ``metadata`` and the collected cookies as
    ``cookies``. The request is printed, and the result of calling
    ``request.fetch()`` is returned without being awaited here.
    """
    form_data, must_cookies, _history = await request_example()
    request = Request(
        url='http://portal.neaea.gov.et/Student/StudentDetailsx',
        method='POST',
        headers={'User-Agent': 'Mozilla/5.0'},
        metadata=form_data,
        cookies=must_cookies,
    )
    print(request)
    pending_fetch = request.fetch()
    return pending_fetch
Exemplo n.º 8
0
def test_retry_delay():
    """Check that a request to an address expected to fail, configured with
    2 retries and a retry delay of 1, takes more than 2 seconds overall."""
    # Invalid URL (to trigger retries) with 1s delay between retries.
    request = Request(
        "http://127.0.0.1:5999/",
        request_config={"RETRIES": 2, "RETRY_DELAY": 1},
    )

    # Time the retries.
    started_at = time.time()
    loop = asyncio.get_event_loop()
    _, _response = loop.run_until_complete(request.fetch_callback(sem=sem))
    elapsed = time.time() - started_at

    # Two retries with 1s between each must add up to more than 2s.
    assert elapsed > 2
Exemplo n.º 9
0
async def request_example():
    """
    Fetch the student portal page and collect what a follow-up form post
    needs.

    :return: tuple ``(form_data, must_cookies, history)`` where ``form_data``
        holds the token read from the page's form input plus an unset
        ``admissionNumber``, ``must_cookies`` maps each required cookie name
        to its value in the response, and ``history`` is ``response.history``
    """
    request = Request(
        url='http://portal.neaea.gov.et/Home/Student',
        method='GET',
        params={'name': 'ruia'},
        headers={'User-Agent': 'Mozilla/5.0'},
    )
    required_cookie_names = ['__RequestVerificationToken']

    response = await request.fetch()
    must_cookies = {
        name: response.cookies.get(name) for name in required_cookie_names
    }
    history = response.history

    page_text = await response.text()
    page = Selector(text=page_text)
    csrf_token = page.xpath("/html/body/div[2]/div/form/input/@value").get()

    form_data = dict([
        ('__RequestVerificationToken', csrf_token),
        ('admissionNumber', None),  # to be set
    ])
    return form_data, must_cookies, history
Exemplo n.º 10
0
    async def async_fetch(
        self,
        url_or_request: Union[Request, str],
        response: Response = None,
    ):
        """
        Fetch the target URL inside a fresh ``aiohttp.ClientSession`` and
        refresh the user namespace with the request and response.

        :param url_or_request: an existing ``Request`` (the new session is
            attached to it) or a URL from which a ``Request`` is built
        :param response: response to process; when ``None`` the request is
            fetched to obtain one
        :return: None
        """
        async with aiohttp.ClientSession() as session:
            if not isinstance(url_or_request, Request):
                request = Request(url=url_or_request, request_session=session)
            else:
                request = url_or_request
                request.request_session = session

            if response is None:
                response = await request.fetch()

            # Attach the page text and its parsed tree to the response.
            page_text = await response.text()
            response.html = page_text
            response.etree = response.html_etree(response.html)

            self.refresh_user_ns(request, response)
Exemplo n.º 11
0
 async def parse(self, response):
     """Yield one POST Request per entry in ``self.start_urls``, sending
     ``self.body`` and tagging each with its index in ``metadata``."""
     for position, start_url in enumerate(self.start_urls):
         post_request = Request(
             start_url,
             method='POST',
             data=self.body,
             callback=self.parse_item,
             metadata={'index': position},
         )
         yield post_request
Exemplo n.º 12
0
 async def parse(self, response):
     """Yield a single Request for the response's own URL, handled by
     ``self.parse_item`` and built with the instance's headers, request
     config and extra keyword arguments."""
     follow_up = Request(url=response.url,
                         callback=self.parse_item,
                         headers=self.headers,
                         request_config=self.request_config,
                         **self.kwargs)
     yield follow_up
Exemplo n.º 13
0
async def make_post_request(sem, callback):
    """
    POST ``params`` (a name from the enclosing scope) to httpbin and return
    the result of ``fetch_callback``.

    :param sem: passed straight to ``Request.fetch_callback``
    :param callback: callback attached to the request
    """
    post_request = Request(
        'https://httpbin.org/post',
        method='POST',
        headers={'Content-Type': 'application/json'},
        data=params,
        callback=callback,
    )
    outcome = await post_request.fetch_callback(sem)
    return outcome
Exemplo n.º 14
0
 async def parse(self, res):
     """Yield one Request per hard-coded httpbin URL, tagging each with its
     list index in ``metadata``."""
     target = 'http://www.httpbin.org/get'
     pages = [target, target]
     for position, page_url in enumerate(pages):
         yield Request(page_url,
                       callback=self.parse_item,
                       metadata={'index': position})
Exemplo n.º 15
0
 async def parse(self, response):
     """Bind ``self.mongo_db`` to the ``ruia_test`` database, then yield one
     Request per hard-coded news page, tagged with its index."""
     self.mongo_db = MotorBase().get_db('ruia_test')
     page_urls = [
         'https://news.ycombinator.com/news?p=1',
         'https://news.ycombinator.com/news?p=2',
     ]
     for position, page_url in enumerate(page_urls):
         yield Request(page_url,
                       callback=self.parse_item,
                       metadata={'index': position})
Exemplo n.º 16
0
 async def parse(self, res):
     """
     Yield one Request per archive item found in ``res.html``.

     ``self.mongo_db`` is bound first. Before each yield, ``DELAY`` in
     ``self.request_config`` is overwritten with a random integer from 5 to
     10 (inclusive); the same config dict object is handed to every Request.
     """
     archives = await alist(ArchivesItem.get_items(html=res.html))
     self.mongo_db = MotorBase(loop=self.loop).get_db()
     for archive in archives:
         # Random sleep
         delay = random.randint(5, 10)
         self.request_config['DELAY'] = delay
         yield Request(
             archive.href,
             callback=self.parse_item,
             request_config=self.request_config,
         )
Exemplo n.º 17
0
Arquivo: request.py Projeto: ziux/ruia
async def request_example():
    """Fetch httpbin's ``/get`` as JSON with one query parameter and a custom
    User-Agent, then assert the parsed response reports both."""
    request = Request(
        url="https://httpbin.org/get",
        method="GET",
        res_type="json",
        params={"name": "ruia"},
        headers={"User-Agent": "Python3.6"},
    )
    response = await request.fetch()
    payload = response.html
    assert payload["args"]["name"] == "ruia"
    payload = response.html
    assert payload["headers"]["User-Agent"] == "Python3.6"
Exemplo n.º 18
0
 async def parse(self, response):
     """Bind ``self.mongo_db`` to the ``hacknews`` database, then yield one
     Request per hard-coded article URL, tagged with its index."""
     self.mongo_db = MotorBase().get_db('hacknews')
     article_urls = [
         'https://baijiahao.baidu.com/s?id=1553475025395018',
         'https://baijiahao.baidu.com/s?id=1570895803249513',
     ]
     for position, article_url in enumerate(article_urls):
         yield Request(article_url,
                       callback=self.parse_item,
                       metadata={'index': position})
Exemplo n.º 19
0
async def make_post_request(sem, callback):
    """
    POST ``params`` (a name from the enclosing scope) to httpbin and return
    the result of ``fetch_callback``.

    :param sem: passed straight to ``Request.fetch_callback``
    :param callback: callback attached to the request
    """
    post_request = Request("https://httpbin.org/post",
                           method="POST",
                           headers={"Content-Type": "application/json"},
                           data=params,
                           callback=callback)
    outcome = await post_request.fetch_callback(sem)
    return outcome
Exemplo n.º 20
0
 async def parse(self, response):
     """Bind ``self.mongo_db`` to the ``ruia_test`` database, then yield one
     Request per hard-coded news page, tagged with its index."""
     self.mongo_db = MotorBase().get_db("ruia_test")
     page_urls = [
         "https://news.ycombinator.com/news?p=1",
         "https://news.ycombinator.com/news?p=2",
     ]
     for position, page_url in enumerate(page_urls):
         page_request = Request(
             page_url,
             callback=self.parse_item,
             metadata={"index": position},
         )
         yield page_request
Exemplo n.º 21
0
 async def parse(self, res):
     """
     Yield one Request per archive item found in the response text.

     Binding ``self.mongo_db`` is attempted first; a failure there is logged
     through ``self.logger.exception`` and does not stop the parsing.
     """
     try:
         self.mongo_db = MotorBase(loop=self.loop).get_db()
     except Exception as e:
         self.logger.exception(e)

     archives = ArchivesItem.get_items(html=await res.text())
     async for archive in archives:
         yield Request(archive.href,
                       callback=self.parse_item,
                       request_config=self.request_config)
Exemplo n.º 22
0
 async def parse(self, res):
     """
     Yield one Request per listing page.

     The page suffixes are the fixed first-page query string followed by the
     ``href`` of every ``.paginator>a`` element; each suffix is appended to
     ``self.start_urls[0]`` and the Request is tagged with its index.
     """
     etree = res.html_etree
     paginator_links = etree.cssselect('.paginator>a')
     pages = ['?start=0&filter=',
              *[link.get('href') for link in paginator_links]]
     for position, suffix in enumerate(pages):
         page_url = self.start_urls[0] + suffix
         yield Request(
             page_url,
             callback=self.parse_item,
             metadata={'index': position},
             request_config=self.request_config,
             **self.kwargs
         )
Exemplo n.º 23
0
 async def timeout_request(sem):
     """
     Issue a GET to httpbin with ``TIMEOUT`` set to 0.1 and one retry, and
     return the result of ``fetch_callback``.

     ``params`` and ``hi`` are names taken from the enclosing scope.

     :param sem: passed straight to ``Request.fetch_callback``
     """
     request = Request("https://httpbin.org/get",
                       method="GET",
                       metadata={"hello": "ruia"},
                       encoding="utf-8",
                       request_config={
                           "RETRIES": 1,
                           "DELAY": 1,
                           "TIMEOUT": 0.1,
                       },
                       params=params,
                       callback=hi)
     outcome = await request.fetch_callback(sem)
     return outcome
Exemplo n.º 24
0
def test_request_config():
    """
    Exercise ``Request`` end to end: its string form, a GET round-trip
    (metadata and echoed query argument) and a POST round-trip (echoed body).
    """
    default_request = Request("https://httpbin.org/")
    assert str(default_request) == "<GET https://httpbin.org/>"

    loop = asyncio.get_event_loop()

    # GET round-trip.
    _, get_response = loop.run_until_complete(
        make_get_request(sem=sem, callback=hello))
    # NOTE: this check is disabled in the original snippet.
    # assert response.callback_result == "hello ruia"
    assert get_response.metadata == {"hello": "ruia"}
    get_json = loop.run_until_complete(get_response.json())
    assert get_json["args"]["name"] == "ruia"

    # POST round-trip.
    _, post_response = loop.run_until_complete(
        make_post_request(sem=sem, callback=None))
    post_json = loop.run_until_complete(post_response.json())
    assert post_json["data"] == "name=ruia"
Exemplo n.º 25
0
def test_request_config():
    """
    Exercise ``Request`` end to end: its string form, a GET round-trip
    (callback result, metadata and echoed query argument) and a POST
    round-trip (echoed body).
    """
    default_request = Request('https://httpbin.org/')
    assert str(default_request) == '<GET https://httpbin.org/>'

    loop = asyncio.get_event_loop()

    # GET round-trip.
    _, get_response = loop.run_until_complete(
        make_get_request(sem=sem, callback=hello))
    assert get_response.callback_result == 'hello ruia'
    assert get_response.metadata == {'hello': 'ruia'}
    get_json = loop.run_until_complete(get_response.json())
    assert get_json['args']['name'] == "ruia"

    # POST round-trip.
    _, post_response = loop.run_until_complete(
        make_post_request(sem=sem, callback=None))
    post_json = loop.run_until_complete(post_response.json())
    assert post_json['data'] == "name=ruia"
Exemplo n.º 26
0
 async def parse(self, res):
     """
     Yield one Request per link matched by ``.content_list .dd_bt a``.

     Each ``href`` is prefixed with ``http:`` before being requested
     (presumably the links are protocol-relative; verify against the page),
     and the Request is tagged with the link's index.
     """
     etree = res.html_etree
     anchors = etree.cssselect('.content_list .dd_bt a')
     hrefs = [anchor.get('href') for anchor in anchors]
     for position, href in enumerate(hrefs):
         absolute_url = 'http:' + href
         yield Request(absolute_url,
                       callback=self.parse_item,
                       metadata={'index': position},
                       request_config=self.request_config)
Exemplo n.º 27
0
    async def parse_item(self, res):
        """
        Yield a Request for every listed article that has no stored HTML yet.

        For each item, ``source_docs`` is queried by URL; the article is
        skipped when a document is found and its ``html`` field is truthy.
        """
        articles = ArticleListItem.get_items(html=await res.text())
        async for article in articles:
            # Links that have already been crawled are not requested again.
            stored_doc = await self.mongo_db.source_docs.find_one(
                {"url": article.href})
            if stored_doc and stored_doc.get("html"):
                continue

            yield Request(article.href,
                          callback=self.save,
                          metadata={"title": article.title},
                          request_config=self.request_config)
Exemplo n.º 28
0
 async def parse_item(self, res):
     """
     Yield a Request for every listed article not yet present in
     ``source_docs``.

     Before each yield, ``DELAY`` in ``self.request_config`` is overwritten
     with a random integer from 5 to 10 (inclusive); the same config dict
     object is handed to every Request.
     """
     articles = await alist(ArticleListItem.get_items(html=res.html))
     for article in articles:
         # Links that have already been crawled are not requested again.
         stored_doc = await self.mongo_db.source_docs.find_one(
             {'url': article.href})
         if stored_doc:
             continue
         # Random sleep
         delay = random.randint(5, 10)
         self.request_config['DELAY'] = delay
         yield Request(
             article.href,
             callback=self.save,
             metadata={'title': article.title},
             request_config=self.request_config,
         )
Exemplo n.º 29
0
 async def timeout_request(sem):
     """
     Issue a GET to httpbin with ``TIMEOUT`` set to 0.1 and one retry, and
     return the result of ``fetch_callback``.

     ``params`` and ``hi`` are names taken from the enclosing scope.

     :param sem: passed straight to ``Request.fetch_callback``
     """
     request = Request(
         'https://httpbin.org/get',
         method='GET',
         metadata={'hello': 'ruia'},
         encoding='utf-8',
         request_config={'RETRIES': 1, 'DELAY': 1, 'TIMEOUT': 0.1},
         params=params,
         callback=hi,
     )
     outcome = await request.fetch_callback(sem)
     return outcome
Exemplo n.º 30
0
    async def parse_item(self, res):
        """
        Yield a Request for every listed article that has no stored HTML yet.

        For each item, ``source_docs`` is queried by URL; the article is
        skipped when a document is found and its ``html`` field is truthy.
        """
        articles = await ArticleListItem.get_items(html=res.html)
        for article in articles:
            # Links that have already been crawled are not requested again.
            stored_doc = await self.mongo_db.source_docs.find_one(
                {'url': article.href})
            if stored_doc and stored_doc.get('html'):
                continue

            yield Request(article.href,
                          callback=self.save,
                          metadata={'title': article.title},
                          request_config=self.request_config)