Пример #1
0
    def download_files(self, response):
        """Yield one file-download item per entry linked from a listing page.

        Every ``js-navigation-open link-gray-dark`` anchor found in
        ``response`` is fetched with a blocking ``requests.get`` call. On the
        fetched page the anchor at index 3 of the same class is taken as the
        archive link and emitted in the ``file_urls`` field of a
        ``MaliciousFileCrawlerItem``.

        Args:
            response: Response for the listing page; only its ``text``
                attribute is read (the object itself is also logged).

        Yields:
            One loaded ``MaliciousFileCrawlerItem`` per listing link.

        Raises:
            Exception: Any failure (for example ``IndexError`` when a fetched
                page has fewer than four matching anchors) is logged and
                re-raised unchanged.
        """
        link_xpath = "//a[@class='js-navigation-open link-gray-dark']/@href"
        try:
            logger.info(f'ZooScraper : download_files : {response}')
            listing = html_xml.fromstring(response.text)

            for href in listing.xpath(link_xpath):
                # Keep the fetched page in its own name so the `response`
                # argument is not shadowed inside the loop.
                page = requests.get(self.base_url + href)
                zip_links = html_xml.fromstring(page.text).xpath(link_xpath)

                # NOTE(review): index 3 presumably skips leading
                # non-archive entries on the page - confirm against the site.
                zip_url = self.base_url + zip_links[3]

                loader = ItemLoader(item=MaliciousFileCrawlerItem())
                loader.add_value('file_urls', zip_url)
                yield loader.load_item()

        except Exception as err:
            logger.error(f'ZooScraper : download_files : {err}')
            # Bare raise re-raises the active exception as-is.
            raise
Пример #2
0
 def test_load_item_using_default_loader(self):
     """A plain dict passed as ``item`` is filled in place and returned."""
     source = {'summary': 'lala'}
     loader = ItemLoader(item=source)
     loader.add_value('name', 'marta')
     loaded = loader.load_item()
     # The loader must hand back the very same dict object it was given.
     assert loaded is source
     assert loaded['summary'] == ['lala']
     assert loaded['name'] == ['marta']
 def test_add_value_list_singlevalue(self):
     """A value added after construction is appended to the existing list."""
     loader = ItemLoader(item=self.item_class(name=['foo', 'bar']))
     loader.add_value('name', 'qwerty')
     result = loader.load_item()
     expected = {'name': ['foo', 'bar', 'qwerty']}
     self.assertIsInstance(result, self.item_class)
     self.assertEqual(dict(result), expected)
Пример #4
0
 def get_payments(self, payments) -> PaymentMethod:
     """Yield one loaded ``PaymentMethod`` item per entry in ``payments``.

     This is a generator: nothing is yielded when ``payments`` is falsy
     (``None`` or empty). Each entry must support ``.get('method')`` and is
     also handed to ``self.get_installments`` to fill ``installments``.
     """
     if not payments:
         return
     for entry in payments:
         item_loader = ItemLoader(item=PaymentMethod())
         item_loader.add_value('method', entry.get('method'))
         installments = self.get_installments(entry)
         item_loader.add_value('installments', installments)
         yield item_loader.load_item()
Пример #5
0
 def parse(self, response):
     """Scrape one page into a ``WikiPageItem`` and follow its links.

     The follow-up requests for every link found by ``self.link_extractor``
     are yielded first; the loaded item for the current page comes last.
     """
     page_links = self.link_extractor.extract_links(response)

     loader = ItemLoader(WikiPageItem(), response, response=response)
     loader.add_css('name', 'h1.firstHeading::text')
     loader.add_value('url', response.url)
     loader.add_css('img', 'td a.image img::attr(src)')
     loader.add_value(
         'links',
         [{'url': found.url, 'text': found.text} for found in page_links],
     )

     targets = [found.url for found in page_links]
     yield from response.follow_all(targets)
     yield loader.load_item()
Пример #6
0
 def parse_application(self, response):
     """Load one listing page into a ``TraidingSsgeItem`` and yield it.

     ``statement_date`` comes from ``response.request.meta['appdate']``,
     ``current_app_url`` from ``response.url``; every other field is filled
     from an XPath expression evaluated against ``response``.

     Args:
         response: Page response; its originating request must carry an
             ``appdate`` entry in ``meta``.

     Yields:
         The single loaded item.

     Raises:
         KeyError: If ``appdate`` is missing from the request meta.
     """
     loader = ItemLoader(item=TraidingSsgeItem(), selector=response)
     # Use the public add_value() API instead of the private _add_value().
     loader.add_value('statement_date', response.request.meta['appdate'])

     # (field, xpath) pairs, split in two groups so that `current_app_url`
     # is still added between `applicant` and `all_apps_url`.
     xpaths_before_url = (
         ('new_or_used',
          "normalize-space(//div[@class='condition']/text())"),
         ('location',
          "//div[@class='location-time']/div[2]/p/span/text()"),
         ('last_updated',
          'normalize-space(//div[@class="location-time"]/div[2]/descendant::span[2]/text())'),
         ('product',
          "normalize-space(//h2[@class='main-title']/text())"),
         ('price',
          "normalize-space(//div[@class='market-item-price ']/text())"),
         ('currency_symbol',
          "normalize-space(//div[@class='market-item-price ']/span/text())"),
         ('applicant',
          "normalize-space(//div[@class='author_type']/text())"),
     )
     xpaths_after_url = (
         ('all_apps_url',
          "//div[@class='author_type']/descendant::span/a/@href"),
         ('agent_or_person',
          "normalize-space((//div[@class='author_type'])[1]/span/a/text())"),
         ('number_of_apps',
          "normalize-space(//div[@class='author_type']/descendant::span[2]/text())"),
         ('product_description',
          "normalize-space(//span[@class='details_text']/text())"),
         ('product_specification',
          "normalize-space(//div[@class='jobs_details']/span/text())"),
         ('product_condition_description',
          "normalize-space(//div[@class='jobs_details'][2]/span[2]/text())"),
         ('seen',
          "normalize-space(//div[@class='article_views']/span/text())"),
         ('app_id',
          "normalize-space(//div[@class='market-item-id']/span/text())"),
         ('phone',
          "normalize-space(//div[@class='numbers-wrap']/a/@href)"),
     )

     for field, xpath in xpaths_before_url:
         loader.add_xpath(field, xpath)
     loader.add_value('current_app_url', response.url)
     for field, xpath in xpaths_after_url:
         loader.add_xpath(field, xpath)

     # Debug aid: dumps the loader's item object to stdout before
     # load_item() is called.
     print(loader.item)
     yield loader.load_item()
Пример #7
0
    def test_get_value(self):
        """Check ``get_value`` with processors/regex, then add and replace."""
        loader = ItemLoader()
        name_pattern = 'name:(.*)$'

        # Processors are applied in order: first element, then upper-cased.
        upper_first = loader.get_value(['foo', 'bar'], TakeFirst(), str.upper)
        self.assertEqual('FOO', upper_first)

        every_match = loader.get_value(['name:foo', 'name:bar'],
                                       re=name_pattern)
        self.assertEqual(['foo', 'bar'], every_match)

        first_match = loader.get_value(['name:foo', 'name:bar'],
                                       TakeFirst(),
                                       re=name_pattern)
        self.assertEqual('foo', first_match)

        loader.add_value('name', ['name:foo', 'name:bar'],
                         TakeFirst(),
                         re=name_pattern)
        self.assertEqual(['foo'], loader.get_collected_values('name'))

        # replace_value must discard the previously collected 'foo'.
        loader.replace_value('name', 'name:bar', re=name_pattern)
        self.assertEqual(['bar'], loader.get_collected_values('name'))
Пример #8
0
    def parse_single_hausnummer_page(self, response):
        """Build a ``Street`` item from request meta plus inline script data.

        ``stadtteil``, ``name`` and ``link`` are copied from
        ``response.meta`` (keys ``stadtteil``, ``adresse``, ``link``).
        ``demographics`` is the text matched by ``self.regex_demographics``
        in the first ``text/javascript`` script that both mentions
        ``demographicInfo`` and matches the pattern; when no script
        qualifies the item is yielded without that field.

        Args:
            response: Page response carrying the three meta keys above.

        Yields:
            The single loaded item.

        Raises:
            KeyError: If one of the expected meta keys is missing.
        """
        meta = response.meta
        loader = ItemLoader(items.Street(), response)
        loader.default_output_processor = TakeFirst()
        loader.add_value('stadtteil', meta['stadtteil'])
        loader.add_value('name', meta['adresse'])
        loader.add_value('link', meta['link'])

        for script in response.css("script[type='text/javascript']").getall():
            if "demographicInfo" not in script:
                continue
            match = self.regex_demographics.search(script)
            if match is None:
                # The marker is present but the pattern does not match;
                # calling .group() here would raise AttributeError and lose
                # the whole item, so keep looking at the remaining scripts.
                continue
            loader.add_value('demographics', match.group())
            break
        yield loader.load_item()
Пример #9
0
    def __callback(self, response: HtmlResponse):
        """Load a product page into a ``LeroymerlinItem`` and yield it.

        Field names come from the ``Fields`` constants. The link is taken
        from ``response.url`` and the category from ``self.__category``;
        name, article number, price, image links and details are taken
        from XPath expressions evaluated on ``response``.
        """
        article_xpath = "//span[@slot='article']/text()"
        images_xpath = "//uc-pdp-media-carousel//img/@src"
        # Terms (dt) and definitions (dd) of the spec list in one query.
        details_xpath = (
            "//dl[@class='def-list']//dt/text()"
            " | //dl[@class='def-list']//dd/text()"
        )

        loader = ItemLoader(item=LeroymerlinItem(), selector=response)
        loader.add_xpath(Fields.name, "//h1/text()")
        loader.add_value(Fields.link, response.url)
        loader.add_xpath(Fields.article_number, article_xpath)
        loader.add_xpath(Fields.price, "//uc-pdp-price-view/span/text()")
        loader.add_xpath(Fields.image_links, images_xpath)
        loader.add_value(Fields.image_paths, [])
        loader.add_value(Fields.category, self.__category)

        # 2) Write a universal handler for product characteristics that
        # builds the data regardless of their type and number.
        loader.add_xpath(Fields.details, details_xpath)

        yield loader.load_item()
Пример #10
0
 def test_add_none(self):
     """Adding ``None`` must leave the field with no collected values."""
     loader = ItemLoader()
     loader.add_value('name', None)
     collected = loader.get_collected_values('name')
     assert collected == []
Пример #11
0
 def test_add_zero(self):
     """A falsy but real value (``0``) must still be collected."""
     loader = ItemLoader()
     loader.add_value('name', 0)
     collected = loader.get_collected_values('name')
     assert collected == [0]
Пример #12
0
 def _test_item(self, item):
     """Assert that ``item`` added to ``item_list`` loads as ``[item]``."""
     loader = ItemLoader()
     loader.add_value('item_list', item)
     loaded = loader.load_item()
     self.assertEqual(loaded, {'item_list': [item]})
    def parse(self, response):
        """Yield a ``ZillowItem`` per listing, then request the next page.

        The response body is JSON; listings are read from
        ``cat1.searchResults.listResults`` and the page count from
        ``cat1.searchList.totalPages``. The number of the page being parsed
        is taken from ``response.meta['currentPage']``.

        Args:
            response: JSON search response carrying ``currentPage`` in meta.

        Yields:
            One loaded ``ZillowItem`` per listing, followed by a
            ``scrapy.Request`` for the next page while pages remain.

        Raises:
            KeyError: If ``currentPage`` or one of the JSON keys above is
                missing.
        """
        current_page = response.meta['currentPage']
        payload = json.loads(response.text)
        houses = payload['cat1']['searchResults']['listResults']
        total_pages = payload['cat1']['searchList']['totalPages']

        # (item field, key in the listing dict) for values copied verbatim.
        direct_fields = (
            ('id', 'id'),
            ('image_urls', 'imgSrc'),
            ('detail_url', 'detailUrl'),
            ('status_type', 'statusType'),
            ('status_text', 'statusText'),
            ('price', 'price'),
            ('address', 'address'),
            ('beds', 'beds'),
            ('baths', 'baths'),
            ('area_sqft', 'area'),
        )

        for house in houses:
            loader = ItemLoader(item=ZillowItem())
            for field, key in direct_fields:
                loader.add_value(field, house.get(key))
            # A listing without coordinates (missing or null `latLong`)
            # must not abort the whole page.
            lat_long = house.get('latLong') or {}
            loader.add_value('latitude', lat_long.get('latitude'))
            loader.add_value('longitude', lat_long.get('longitude'))
            loader.add_value('broker_name', house.get('brokerName'))
            loader.add_value('broker_phone', house.get('brokerPhone'))
            yield loader.load_item()

        print({
            "houses": len(houses),
            "current_page": current_page,
            "total_pages": total_pages
        })

        # `current_page` is the page just parsed, so another request is only
        # due while it is strictly below `total_pages`; `<=` would ask for
        # one page past the end.
        # Assumes page numbers start at 1 - TODO confirm.
        if current_page < total_pages:
            next_page = current_page + 1
            yield scrapy.Request(
                url=parse_new_url(URL, page_number=next_page),
                callback=self.parse,
                cookies=get_cookie(),
                meta={
                    'currentPage': next_page
                }
            )