Example #1
def test_skip_elems_by_selector():
    """
    should test for nested elements that should have been removed
    i.e. if a class='link' should be skipped when inside a
    parent element that is not skipped
    """
    spider = serialize.SerializeSpider()
    fake_response = TextResponse(encoding='utf-8',
                                 url='https://doc.scrapy.org')

    spider.exclude_selectors = ['.skip-elem']

    with open('test/data/skip_by_selector.html', mode='rb') as fd:
        body = fd.read()
        fake_response._set_body(body)

    item = spider.parse(fake_response)
    text = next(item).get('text').strip()

    with open('test/data/skip_by_selector.txt') as fd:
        test_text = fd.read().strip()

    # normalize whitespace after line breaks before comparing
    text = re.sub(r'\n\s', '\n', text).strip()
    test_text = re.sub(r'\n\s', '\n', test_text).strip()

    assert text == test_text
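
Note that _set_body is a private Scrapy API. A hedged variant of the fixture-loading block above, using the public Response.replace() instead (replace() returns a new response rather than mutating in place):

# hypothetical variant of the fixture loading, public API only
with open('test/data/skip_by_selector.html', mode='rb') as fd:
    fake_response = fake_response.replace(body=fd.read())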
Example #2
def test_block_elem_with_children():
    """
    tests 2 block elems with children
    ignores trailing whitespace
    """
    spider = serialize.SerializeSpider()

    fake_response = TextResponse(encoding='utf-8',
                                 url='https://doc.scrapy.org')

    with open('test/data/block_elem.html', mode='rb') as fd:
        body = fd.read()
        fake_response._set_body(body)

    item = spider.parse(fake_response)
    text = next(item).get('text')

    with open('test/data/block_elem.txt') as fd:
        test_text = fd.read()

    # normalize whitespace after line breaks before comparing
    text = re.sub(r'\n\s', '\n', text).strip()
    test_text = re.sub(r'\n\s', '\n', test_text).strip()

    assert text == test_text
Example #3
def test_skip_elems_by_tag():
    """
    make sure elems are skipped by tag name e.g. script
    """
    spider = serialize.SerializeSpider()
    fake_response = TextResponse(encoding='utf-8',
                                 url='https://doc.scrapy.org')

    spider.exclude_tags.append('footer')

    with open('test/data/skip_by_tag.html', mode='rb') as fd:
        body = fd.read()
        fake_response._set_body(body)

    item = spider.parse(fake_response)
    text = next(item).get('text').strip()

    with open('test/data/skip_by_tag.txt') as fd:
        test_text = fd.read().strip()

    # normalize whitespace after line breaks before comparing
    text = re.sub(r'\n\s', '\n', text).strip()
    test_text = re.sub(r'\n\s', '\n', test_text).strip()

    assert text == test_text
Example #4
def inspect_spider(s):
    """Instantiate spider class s and walk one request through parse() and parse_descr()."""
    news = s()
    try:
        req1 = list(news.start_requests())[0]
        html1 = requests.get(req1.url).content
        response1 = TextResponse(url=req1.url, body=html1, encoding='utf-8')
        req2 = list(news.parse(response1))[0]
        html2 = requests.get(req2.url).content
        response2 = TextResponse(url=req2.url, body=html2, encoding='utf-8')
        for d in news.parse_descr(response2):
            print("One course description you found is:", d)
            break
    except Exception:
        print("Oh no! Something is wrong with the code. Keep trying!")
Example #5
def parse(self, response: TextResponse):
    items = response.css('ul.sellListContent li')
    for li in items:
        item = ScrapyLianjiaErshoufangItem()
        item['title'] = li.css('div.title a::text').get().replace(':', '').replace(',', ' ').replace("\n", '')
        # pull the five '|'-separated fields out of the house-info line
        house_infos = li.css('div.address .houseInfo::text').re(
            r'\|\s+(.*)\s+\|\s+(.*)平米\s+\|\s+(.*)\s+\|\s+(.*)\s+\|\s+(.*)')
        item['room'] = house_infos[0]
        item['area'] = house_infos[1]
        item['orientation'] = house_infos[2]
        item['decoration'] = house_infos[3]
        item['elevator'] = house_infos[4]
        item['xiaoqu'] = li.css('div.address a::text').get()
        item['flood'] = li.css('div.flood .positionInfo::text').get().replace('-', '').strip()
        item['location'] = li.css('div.flood .positionInfo a::text').get()
        follow_infos = li.css('div.followInfo::text').re(r'(.*)人关注\s+/\s+共(.*)次带看\s+/\s+(.*)发布')
        item['follow_number'] = follow_infos[0]
        item['look_number'] = follow_infos[1]
        item['pub_duration'] = follow_infos[2]
        item['total_price'] = li.css('div.priceInfo div.totalPrice span::text').get()
        unit_price = li.css('div.priceInfo .unitPrice span::text').re(r'单价(.*)元/平米')
        item['unit_price'] = unit_price[0]
        item['total_unit'] = li.css('div.totalPrice::text').get()
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # derive a stable id from the descriptive fields
        item['house_id'] = self.genearteMD5(''.join((str(item['title']), str(item['room']), str(item['area']),
                                                     str(item['orientation']), str(item['elevator']),
                                                     str(item['xiaoqu']),
                                                     str(item['flood']), str(item['location']))))
        yield item
Example #6
def test_text_cdr_item():
    response = TextResponse(url='http://example.com',
                            headers={
                                'Content-Type': 'text/plain',
                                'another-header': 'text/complain, text/explain'
                            },
                            body=b'a body',
                            encoding='utf8')
    item = text_cdr_item(response, crawler_name='crawler', team_name='team')
    item = dict(item)
    item_id = item.pop('_id')  # type: str
    assert item_id.isupper()
    check_timestamp_crawl(item)
    assert dict(item) == {
        'content_type': 'text/plain',
        'crawler': 'crawler',
        'objects': [],
        'raw_content': 'a body',
        'response_headers': {
            'content-type': 'text/plain',
            'another-header': 'text/complain, text/explain'
        },
        'team': 'team',
        'url': 'http://example.com',
        'version': 3.1
    }
Example #7
def fakeResponseFromFile(file_name, url=None):
    if not url:
        url = 'http://www.example.com'
    file_path = getAbsolutePath(file_name)
    # close the fixture file promptly instead of leaking the handle
    with open(file_path, 'r') as fd:
        file_content = fd.read()
    # a str body requires an explicit encoding for TextResponse
    response = TextResponse(url=url,
                            request=Request(url=url),
                            body=file_content,
                            encoding='utf-8')
    return response
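
A sketch of how such a helper is typically used in an offline test; MySpider and sample.html are illustrative names, not from the original project:

def test_parse_from_fixture():
    # build a response from a local fixture instead of hitting the network
    response = fakeResponseFromFile('sample.html')
    results = list(MySpider().parse(response))
    assert len(results) > 0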
Example #8
def test_stats(mocked_time):
    middleware = get_test_middleware()
    spider = Spider("foo")

    count = 100
    nums = list(range(count))
    random.shuffle(nums)
    status_list = [random.randint(1, 15) for _ in range(count)]
    method_list = [
        random.choice(["GET", "POST", "PUT", "DELETE", "HEAD"])
        for _ in range(count)
    ]

    # expected values
    latencies = [2**n - n for n in nums]
    total_latency = sum(latencies)
    avg_latency = total_latency / count
    max_latency = max(latencies)

    for n, status, method in zip(nums, status_list, method_list):
        request = Request("https://example.org", method=method)
        mocked_time.return_value = n  # start_ts
        processed_request = middleware.process_request(request, spider)

        response = TextResponse(
            url="https://example.org",
            request=processed_request,
            body=json.dumps({
                "headers": {},
                "original_status": status,
                "body": "",
                "url": "http://"
            }).encode("utf-8"),
        )

        mocked_time.return_value = 2**n  # end_ts
        middleware.process_response(processed_request, response, spider)

    middleware.spider_closed(spider, "finished")

    assert middleware.stats.get_value("crawlera_fetch/request_count") == count
    assert middleware.stats.get_value("crawlera_fetch/response_count") == count
    assert middleware.stats.get_value(
        "crawlera_fetch/total_latency") == total_latency
    assert middleware.stats.get_value(
        "crawlera_fetch/avg_latency") == avg_latency
    assert middleware.stats.get_value(
        "crawlera_fetch/max_latency") == max_latency
    for status in set(status_list):
        sc = middleware.stats.get_value(
            "crawlera_fetch/response_status_count/{}".format(status))
        assert sc == status_list.count(status)
    for method in set(method_list):
        mc = middleware.stats.get_value(
            "crawlera_fetch/request_method_count/{}".format(method))
        assert mc == method_list.count(method)
Example #9
    def process_request(self, request, spider):
        # But this does not preserve concurrency: URLs are fetched one at a time. How can it be improved?
        if spider.name == 'suning_phone' and request.meta.get('use_selenium'):

            html = self.fetch_dynamic_html(request.url)

            return TextResponse(request.url,
                                encoding='utf-8',
                                body=html,
                                request=request)
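
Returning a response from process_request makes Scrapy skip the downloader for that request, which is how the Selenium-rendered HTML reaches the spider. A sketch of what fetch_dynamic_html might look like, assuming a shared Selenium driver (the original implementation is not shown in this excerpt):

    def fetch_dynamic_html(self, url):
        # illustrative sketch: render the page in a shared Selenium driver
        self.driver.get(url)
        return self.driver.page_source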
Example #10
def parse_exercise(self, response: Response):
    # `ItemLoader` will only accept (subclasses of) `TextResponse`, so we forge a
    # `TextResponse` with everything of the actual response except `body`.
    response_copy = TextResponse(
        url=response.url,
        status=response.status,
        headers=response.headers,
        flags=response.flags,
        request=response.request,
    )
    return self.exercise_loader.parse(response_copy)
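
The same conversion can be sketched with the public Response.replace(), which accepts a cls argument and copies url, status, headers, flags and request by default; the body is dropped explicitly, as the comment above requires:

# hedged one-liner alternative to the forged TextResponse above,
# written as it would appear inside the same method
response_copy = response.replace(cls=TextResponse, body=b"")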
Example #11
def test_feed_url(self):
    url = 'http://example.com/feed'
    feed = FeedGenerator(lambda: 0)
    # a str body requires an explicit encoding for TextResponse
    response = TextResponse(url, body=(
        'http://example.com/1\r'
        'http://example.com/2\r\n'
        'http://example.com/3\n\r'
        'http://example.com/4\n'), encoding='utf-8')
    self.assertEqual([r.url for r in feed.parse_urls(response)], [
        'http://example.com/1',
        'http://example.com/2',
        'http://example.com/3',
        'http://example.com/4',
    ])
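
The expected output implies parse_urls normalizes mixed line endings. A minimal sketch of such a method, assuming it simply splits the response text on universal newlines (the project's real implementation may differ):

def parse_urls(self, response):
    # str.splitlines() treats \r, \n and \r\n uniformly
    for line in response.text.splitlines():
        url = line.strip()
        if url:
            yield Request(url)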
Example #12
def test_process_response_skip():
    response = TextResponse(
        url="https://example.org",
        status=200,
        headers={
            "Content-Encoding": "gzip",
            "Transfer-Encoding": "chunked",
            "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
        },
        request=Request(url="https://example.org",
                        meta={"crawlera_fetch": {
                            "skip": True
                        }}),
        body=b"""<html></html>""",
    )

    middleware = get_test_middleware()
    processed = middleware.process_response(response.request, response,
                                            Spider("foo"))

    assert response is processed
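
The identity assertion above implies the middleware short-circuits on the skip flag before doing any decoding. A minimal sketch of that guard, assumed rather than taken from the library's source:

def process_response(self, request, response, spider):
    # leave responses marked with skip untouched
    if request.meta.get("crawlera_fetch", {}).get("skip"):
        return response
    ...  # normal decoding path continues here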
Example #13
def test_write(self):
    self.response = TextResponse(url=self.url, body="OK".encode("utf-8"))
    resp = self.cfr.process_response(self.request, self.response,
                                     self.spider)
    self.assertIsInstance(resp, TextResponse)
Example #14
def test_process_response_error():
    response_list = [
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request":
                        request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request":
                        request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request":
                        request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=json.dumps({
                "url": "https://example.org",
                "original_status": 503,
                "headers": {},
                "crawlera_status": "fail",
                "crawlera_error": "serverbusy",
                "body_encoding": "plain",
                "body": "Server busy: too many outstanding requests",
            }),
            encoding="utf8",
        ),
    ]

    middleware_raise = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response,
                                              foo_spider)

    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error") == 3
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/serverbusy") == 1

    middleware_log = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(
                response.request, response, foo_spider)
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)",  # noqa: E501
        ),
    )

    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/serverbusy") == 1
Example #15
                    },
                    "original_request":
                    request_to_dict(
                        Request("https://fake.host.com"),
                        spider=foo_spider,
                    ),
                }
            },
        ),
        body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
    ),
    "expected":
    TextResponse(
        url="https://fake.host.com",
        status=123,
        headers={"Fake-Header": "true"},
        body=b"""foobar""",  # noqa: E501
    ),
})

test_responses.append({
    "original":
    HtmlResponse(
        url=SETTINGS["CRAWLERA_FETCH_URL"],
        status=200,
        headers={
            "Content-Type": "application/json",
            "Content-Encoding": "gzip",
            "Transfer-Encoding": "chunked",
            "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
            "Proxy-Connection": "close",
Example #16
def getResponse(self, url, browser):
    # wrap the Selenium-rendered page source in a Scrapy TextResponse
    res = TextResponse(url, body=browser.page_source.encode("utf-8"))
    return res
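
A hedged usage sketch: once wrapped, the Selenium-rendered page supports Scrapy's normal selector API (the driver setup and names below are illustrative):

browser = webdriver.Chrome()  # illustrative Selenium setup
browser.get('https://example.org')
res = helper.getResponse('https://example.org', browser)  # 'helper' owns the method above
titles = res.css('h1::text').getall()  # standard Scrapy selectors now work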
Example #17
def test_process_response_error():
    response_list = [
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request": {
                            "url": "https://example.org",
                            "method": "GET"
                        },
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {
                            "start_ts": mocked_time()
                        },
                        "original_request": {
                            "url": "https://example.org",
                            "method": "GET"
                        },
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
    ]

    middleware_raise = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response,
                                              Spider("foo"))

    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error") == 2
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1

    middleware_log = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(
                response.request, response, Spider("foo"))
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
    )

    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 2
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1
Example #18
    def test_get_review_items_from_rdfa_using_alphr_syntax(self):
        # This is an extract from http://alphr.com/go/1006047, captured on 2017/08/08
        html_text = '''
                        <body id="pid-htc-1006047-htc-u11-review-htcs-flagship-is-a-squeezy-pleaser" class="html not-front not-logged-in page-node page-node- page-node-1006047 node-type-review one-sidebar sidebar-second narrow-stacked snap" 
                         prefix="v: http://rdf.data-vocabulary.org/# schema: http://schema.org/">
                        <div id="main" class="page-main-area" typeof="schema:Review">
                        <a id="main-content-area"></a> 
                        <main id="group-content" class="group group-content" >
                        <div id="page_title_content">
                           <h1 id="page-title" class="page-title title">HTC U11 review: HTC&#039;s flagship is a squeezy pleaser</h1>
                           <span property="schema:headline" content="HTC U11 review: HTC&#039;s flagship is a squeezy pleaser " class="rdf-meta element-hidden"></span>
                        </div>
                        <div id="content" class="region region-content content">
                        <div id="block-system-main" class="block block-system">
                        <div class="content">
                        <div class="node node-review odd node-full">
                        <div class="content">
                        <span property="schema:itemReviewed" content="HTC U11"></span>
                        <div class="field field-name-kicker field-label-inline">
                           <div class="field-items"><a href="/htc">HTC</a></div>
                        </div>
                        <h2 class="short-teaser" property="schema:description">Shiny but pricey; HTC once again falls into the Samsung comparison trap</h2>
                        <div class="field-group-format group_meta required-fields group-meta">
                           <span class="field field-name-field-author field-type-node-reference field-label-hidden">
                              <span class="field-item even" property="schema:author" typeof="schema:Person">
                                 <div class="node node-author node-sticky even node-inline-block" >
                                    <div class="content" >
                                       <div class="field field-name-field-author-first-name field-type-text field-label-hidden">
                                          <div class="field-items">
                                             <div class="field-item even"><span property="schema:name"><a href="http://www.alphr.com/authors/alan-martin" title="Alan Martin" class="author-link" property="schema:url">Alan Martin</a></span></div>
                                          </div>
                                       </div>
                                       <div class="field field-name-field-twitter-username field-type-text field-label-hidden">
                                          <div class="field-items">
                                             <div class="field-item even"><a href="http://www.twitter.com/alan_p_martin" class="follow-button-twitter" target="_blank" title="Follow on Twitter" rel="">@alan_p_martin</a></div>
                                          </div>
                                       </div>
                                    </div>
                                 </div>
                              </span>
                           </span>
                           <div class="field-name-field-published-date" ><span class="date-display-single" property="schema:datePublished" content="2017-07-12" datatype="xsd:dateTime">12 Jul 2017</span></div>
                        </div>
                        <div class="field field-name-field-review-score-overall field-type-fivestar field-label-hidden" property="schema:reviewRating" typeof="schema:Rating"><div class="field-items">
                            <div class="field-item even" property="schema:ratingValue" content="5"></div>
                        </div>
                        </body>
                    '''
        # a str body requires an explicit encoding for TextResponse
        response = TextResponse(url='http://alphr.com/go/1006047',
                                body=html_text,
                                encoding='utf-8')

        rdfa_items = extruct_helper.extract_all_rdfa(response)
        self.assertIsNotNone(rdfa_items)

        review = extruct_helper.get_review_items_from_rdfa(
            response, rdfa_items)
        self.assertEqual(len(review), 1)

        review = review[0]
        self.assertEqual(review['ProductName'], 'HTC U11')
        self.assertEqual(review['Author'], 'Alan Martin')
        self.assertEqual(review['TestDateText'], '2017-07-12')
        self.assertEqual(
            review['TestTitle'],
            "HTC U11 review: HTC's flagship is a squeezy pleaser")
        self.assertEqual(
            review['TestSummary'],
            'Shiny but pricey; HTC once again falls into the Samsung comparison trap'
        )
        )