def download_files(self, response):
    """Collect zip archive URLs from a GitHub-style file listing and yield
    items whose ``file_urls`` field feeds the files pipeline.

    Parameters
    ----------
    response : the Scrapy response for the repository listing page.

    Yields
    ------
    Loaded ``MaliciousFileCrawlerItem`` instances, one per zip URL found.

    Raises
    ------
    Re-raises any exception after logging it.
    """
    try:
        logger.info(f'ZooScraper : download_files : {response}')
        zip_urls = []
        # Directory links on the listing page.
        html = html_xml.fromstring(response.text)
        links = html.xpath(
            "//a[@class='js-navigation-open link-gray-dark']/@href")
        for url in links:
            git_url = self.base_url + url
            # NOTE(review): blocking requests.get inside a Scrapy callback
            # stalls the reactor; consider yielding scrapy Requests instead.
            # Use distinct names so the outer `response`/`html` are not
            # shadowed (the original reused both names inside the loop).
            page = requests.get(git_url)
            page_html = html_xml.fromstring(page.text)
            zip_links = page_html.xpath(
                "//a[@class='js-navigation-open link-gray-dark']/@href")
            # Index 3 is assumed to point at the zip archive — TODO confirm.
            zip_url = self.base_url + zip_links[3]
            zip_urls.append(zip_url)
            loader = ItemLoader(item=MaliciousFileCrawlerItem())
            loader.add_value('file_urls', zip_url)
            yield loader.load_item()
    except Exception as err:
        logger.error(f'ZooScraper : download_files : {err}')
        # Bare raise preserves the original traceback (raise err rewrote it).
        raise
def test_load_item_using_default_loader(self):
    # Loading through a plain ItemLoader must hand back the exact dict that
    # was passed in, with each added value collected into a list.
    source = dict(summary='lala')
    loader = ItemLoader(item=source)
    loader.add_value('name', 'marta')
    result = loader.load_item()
    assert result is source
    assert result['summary'] == ['lala']
    assert result['name'] == ['marta']
def test_add_value_list_singlevalue(self):
    """Values added after initialization should be appended"""
    item_in = self.item_class(name=['foo', 'bar'])
    loader = ItemLoader(item=item_in)
    loader.add_value('name', 'qwerty')
    item_out = loader.load_item()
    # The loader preserves the item class and appends to the existing list.
    self.assertIsInstance(item_out, self.item_class)
    self.assertEqual(dict(item_out), {'name': ['foo', 'bar', 'qwerty']})
def get_payments(self, payments):
    """Yield one loaded ``PaymentMethod`` item per entry in *payments*.

    The previous ``-> PaymentMethod`` return annotation was misleading —
    this is a generator, not a single PaymentMethod — so it was dropped.

    Parameters
    ----------
    payments : iterable of payment dicts (or falsy to yield nothing).
    """
    if not payments:
        return
    for payment in payments:
        loader = ItemLoader(item=PaymentMethod())
        loader.add_value('method', payment.get('method'))
        loader.add_value('installments', self.get_installments(payment))
        yield loader.load_item()
def parse(self, response):
    """Extract page title, URL, image and outgoing links into a WikiPageItem,
    then follow every extracted link before yielding the item."""
    # The original passed `response` twice — positionally as the selector AND
    # via the `response=` keyword. Passing only `response=` is equivalent:
    # the loader derives its selector from the response.
    loader = ItemLoader(item=WikiPageItem(), response=response)
    loader.add_css('name', 'h1.firstHeading::text')
    loader.add_value('url', response.url)
    loader.add_css('img', 'td a.image img::attr(src)')
    links = self.link_extractor.extract_links(response)
    links_items = [{'url': link.url, 'text': link.text} for link in links]
    loader.add_value('links', links_items)
    # Schedule a crawl of every outgoing link, then emit this page's item.
    yield from response.follow_all([link.url for link in links])
    yield loader.load_item()
def parse_application(self, response):
    """Scrape a single classified-ad page into a TraidingSsgeItem.

    Reads the statement date from ``response.request.meta['appdate']`` and
    everything else via XPath from the page markup.
    """
    loader = ItemLoader(item=TraidingSsgeItem(), selector=response)
    # Use the public add_value API; _add_value is a private method of
    # ItemLoader and may change without notice.
    loader.add_value('statement_date', response.request.meta['appdate'])
    loader.add_xpath('new_or_used',
                     "normalize-space(//div[@class='condition']/text())")
    loader.add_xpath('location',
                     "//div[@class='location-time']/div[2]/p/span/text()")
    loader.add_xpath(
        'last_updated',
        'normalize-space(//div[@class="location-time"]/div[2]/descendant::span[2]/text())'
    )
    loader.add_xpath('product',
                     "normalize-space(//h2[@class='main-title']/text())")
    loader.add_xpath(
        'price',
        "normalize-space(//div[@class='market-item-price ']/text())")
    loader.add_xpath(
        'currency_symbol',
        "normalize-space(//div[@class='market-item-price ']/span/text())")
    loader.add_xpath(
        'applicant',
        "normalize-space(//div[@class='author_type']/text())")
    loader.add_value('current_app_url', response.url)
    loader.add_xpath(
        'all_apps_url',
        "//div[@class='author_type']/descendant::span/a/@href")
    loader.add_xpath(
        'agent_or_person',
        "normalize-space((//div[@class='author_type'])[1]/span/a/text())")
    loader.add_xpath(
        'number_of_apps',
        "normalize-space(//div[@class='author_type']/descendant::span[2]/text())"
    )
    loader.add_xpath(
        'product_description',
        "normalize-space(//span[@class='details_text']/text())")
    loader.add_xpath(
        'product_specification',
        "normalize-space(//div[@class='jobs_details']/span/text())")
    loader.add_xpath(
        'product_condition_description',
        "normalize-space(//div[@class='jobs_details'][2]/span[2]/text())")
    loader.add_xpath(
        'seen',
        "normalize-space(//div[@class='article_views']/span/text())")
    loader.add_xpath(
        'app_id',
        "normalize-space(//div[@class='market-item-id']/span/text())")
    loader.add_xpath(
        'phone',
        "normalize-space(//div[@class='numbers-wrap']/a/@href)")
    # Removed leftover debug print(loader.item).
    yield loader.load_item()
def test_get_value(self):
    # get_value applies processors and the `re` filter without touching
    # collected state; add/replace_value then populate collected values.
    loader = ItemLoader()
    self.assertEqual(
        'FOO', loader.get_value(['foo', 'bar'], TakeFirst(), str.upper))
    self.assertEqual(
        ['foo', 'bar'],
        loader.get_value(['name:foo', 'name:bar'], re='name:(.*)$'))
    self.assertEqual(
        'foo',
        loader.get_value(['name:foo', 'name:bar'], TakeFirst(),
                         re='name:(.*)$'))
    loader.add_value('name', ['name:foo', 'name:bar'], TakeFirst(),
                     re='name:(.*)$')
    self.assertEqual(['foo'], loader.get_collected_values('name'))
    loader.replace_value('name', 'name:bar', re='name:(.*)$')
    self.assertEqual(['bar'], loader.get_collected_values('name'))
def parse_single_hausnummer_page(self, response):
    """Build a Street item from request meta plus demographics scraped out
    of an inline JavaScript block on the page."""
    meta = response.meta
    loader = ItemLoader(items.Street(), response)
    loader.default_output_processor = TakeFirst()
    loader.add_value('stadtteil', meta['stadtteil'])
    loader.add_value('name', meta['adresse'])
    loader.add_value('link', meta['link'])
    for script in response.css("script[type='text/javascript']").getall():
        if "demographicInfo" in script:
            # Guard: the script may mention the marker without matching the
            # pattern — previously .group() on a None match raised
            # AttributeError and killed the callback.
            match = self.regex_demographics.search(script)
            if match:
                loader.add_value('demographics', match.group())
            break
    yield loader.load_item()
def __callback(self, response: HtmlResponse):
    """Populate a LeroymerlinItem from a product page and yield it."""
    loader = ItemLoader(item=LeroymerlinItem(), selector=response)
    loader.add_xpath(Fields.name, "//h1/text()")
    loader.add_value(Fields.link, response.url)
    loader.add_xpath(Fields.article_number, "//span[@slot='article']/text()")
    loader.add_xpath(Fields.price, "//uc-pdp-price-view/span/text()")
    loader.add_xpath(Fields.image_links, "//uc-pdp-media-carousel//img/@src")
    loader.add_value(Fields.image_paths, [])
    loader.add_value(Fields.category, self.__category)
    # TODO: write a universal product-attribute handler that builds the data
    # regardless of attribute type and count.
    loader.add_xpath(
        Fields.details,
        "//dl[@class='def-list']//dt/text() | //dl[@class='def-list']//dd/text()"
    )
    yield loader.load_item()
def test_add_none(self):
    # None values are silently discarded by the loader.
    loader = ItemLoader()
    loader.add_value('name', None)
    assert loader.get_collected_values('name') == []
def test_add_zero(self):
    # Falsy-but-not-None values such as 0 must be kept.
    loader = ItemLoader()
    loader.add_value('name', 0)
    assert loader.get_collected_values('name') == [0]
def _test_item(self, item):
    # A single added value must come back wrapped in a one-element list.
    loader = ItemLoader()
    loader.add_value('item_list', item)
    self.assertEqual(loader.load_item(), {'item_list': [item]})
def parse(self, response):
    """Parse one page of Zillow JSON search results, yield a ZillowItem per
    listing, and paginate until the final page.

    Expects ``response.meta['currentPage']`` to carry the 1-based page
    number of this response.
    """
    current_page = response.meta['currentPage']
    json_resp = json.loads(response.text)
    houses = json_resp['cat1']['searchResults']['listResults']
    total_pages = json_resp['cat1']['searchList']['totalPages']
    for house in houses:
        loader = ItemLoader(item=ZillowItem())
        loader.add_value('id', house.get('id'))
        loader.add_value('image_urls', house.get('imgSrc'))
        loader.add_value('detail_url', house.get('detailUrl'))
        loader.add_value('status_type', house.get('statusType'))
        loader.add_value('status_text', house.get('statusText'))
        loader.add_value('price', house.get('price'))
        loader.add_value('address', house.get('address'))
        loader.add_value('beds', house.get('beds'))
        loader.add_value('baths', house.get('baths'))
        loader.add_value('area_sqft', house.get('area'))
        # latLong may be absent for some listings; previously
        # house.get('latLong').get(...) raised AttributeError on None.
        lat_long = house.get('latLong') or {}
        loader.add_value('latitude', lat_long.get('latitude'))
        loader.add_value('longitude', lat_long.get('longitude'))
        loader.add_value('broker_name', house.get('brokerName'))
        loader.add_value('broker_phone', house.get('brokerPhone'))
        yield loader.load_item()
    print({
        "houses": len(houses),
        "current_page": current_page,
        "total_pages": total_pages
    })
    # Strict < fixes an off-by-one: the old <= requested one page past
    # total_pages when already on the last page.
    if current_page < total_pages:
        current_page += 1
        yield scrapy.Request(
            url=parse_new_url(URL, page_number=current_page),
            callback=self.parse,
            cookies=get_cookie(),
            meta={
                'currentPage': current_page
            }
        )