class LivrosItem(scrapy.Item):
    """Item holding the fields scraped for one book listing."""
    # BUG FIX: fields on a scrapy.Item subclass must be declared with
    # scrapy.Field(), not scrapy.Item() — Field() registers the key so
    # item['titulo'] = ... works; Item() would just nest an empty item
    # and every assignment through these keys would fail.
    titulo = scrapy.Field()    # book title
    preco = scrapy.Field()     # price
    codigo = scrapy.Field()    # catalogue code
    pagina = scrapy.Field()    # page count / page reference
    editora = scrapy.Field()   # publisher
    autor = scrapy.Field()     # author
    link = scrapy.Field()      # source URL
def parse_item(self, response):
    """Parse one item page into a dict of scraped fields.

    Returns a plain dict (Scrapy accepts dicts from callbacks).
    """
    self.logger.info('Hi, this is an item page! %s', response.url)
    # BUG FIX: a bare scrapy.Item() declares no fields, so every
    # item['key'] = ... below raised KeyError at runtime.  A plain
    # dict is the drop-in fix.
    item = {}
    item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = response.xpath('//td[@id="item_name"]/text()').get()
    item['description'] = response.xpath(
        '//td[@id="item_description"]/text()').get()
    # link_text is set by the LinkExtractor that scheduled this request.
    item['link_text'] = response.meta['link_text']
    return item
def parse_item(self, response):
    """Parse one item page into a dict of scraped fields.

    Returns a plain dict (Scrapy accepts dicts from callbacks).
    """
    self.logger.info('Hi, this is an item page! %s', response.url)
    # BUG FIX: a bare scrapy.Item() declares no fields, so every
    # item['key'] = ... below raised KeyError at runtime.  A plain
    # dict is the drop-in fix.
    item = {}
    item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = response.xpath(
        '//*[@id="ContentPlaceHolder_Header_HeadingBread_TagH1"]').get()
    item['description'] = response.xpath(
        '//td[@id="item_description"]/text()').get()
    return item
def parse(self, response): #print(response.css('.module-typeD .list-bullet').extract()) #print(response.css('.module-typeD').extract()) #titles = response.xpath('//li[@class="item-title"]/a/text()').extract() item = scrapy.Item() items = [] divOB = response.xpath('.//div[@id="rss-outbreaksUS"]').extract() for p in divOB:
def parse_item(self, response):
    """Parse one item page into a dict of scraped fields.

    Returns a plain dict (Scrapy accepts dicts from callbacks).
    """
    self.log('Hi, this is an item page! %s' % response.url)
    # BUG FIX: a bare scrapy.Item() declares no fields, so every
    # item['key'] = ... below raised KeyError at runtime.  A plain
    # dict is the drop-in fix.
    item = {}
    item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
    item['description'] = response.xpath(
        '//td[@id="item_description"]/text()').extract()
    return item
def parse_item(self, response):
    """Parse one product page into a dict with url, title and price."""
    # TODO normal parser
    self.logger.info("Hi, this is an item page! %s", response.url)
    # BUG FIX: a bare scrapy.Item() declares no fields, so the key
    # assignments below raised KeyError; use a plain dict.
    item = {}
    item["url"] = response.url
    # BUG FIX: extract_first() returns None when the node is missing,
    # making .strip() raise AttributeError.  Default to '' so a page
    # without the node yields an empty string instead of crashing.
    item["title"] = response.xpath(
        '//h1[@data-qaid="title-h1"]/text()').get(default='').strip()
    item["price"] = response.xpath(
        '//span[@data-qaid="product-price"]/text()').get(default='').strip()
    return item
def parse_item(self, response):
    """Parse one item page into a dict of scraped fields.

    Returns a plain dict (Scrapy accepts dicts from callbacks).
    """
    self.logger.info('Hi, this is an item page! %s', response.url)
    # BUG FIX: a bare scrapy.Item() declares no fields, so every
    # item['key'] = ... below raised KeyError at runtime.  A plain
    # dict is the drop-in fix.  (Stale commented-out template code
    # from `scrapy genspider` removed.)
    item = {}
    item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
    item['description'] = response.xpath(
        '//td[@id="item_description"]/text()').extract()
    return item
def parse_item(self, response):
    """Parse one page and yield a dict carrying its <title> text."""
    self.logger.info('Hi, this is an item page! %s', response.url)
    print(response)
    # BUG FIX: a bare scrapy.Item() declares no fields, so
    # item["title"] = ... raised KeyError; use a plain dict.
    item = {}
    # BUG FIX: the original stored the raw SelectorList and then
    # dropped the item on the floor (no return/yield — the commented
    # `yield item` showed the intent).  Extract the text and yield.
    item["title"] = response.xpath('//title/text()').get()
    yield item
def parse_item(self, response):
    """Scrape a package page, then follow the additional-data link to
    finish the item in parse_additional_page."""
    self.logger.info('Hi, this is an item page! %s', response.url)
    # BUG FIX: a bare scrapy.Item() declares no fields, so the key
    # assignments below raised KeyError; use a plain dict.
    item = {}
    item['id'] = response.xpath('//td[@id="usage-link-sentry"]/text()')
    item['name'] = response.xpath(
        '//td[@id="usage-image-sentry"]/text()').get()
    item['description'] = response.xpath(
        '//td[@class="package-githubcommits"]/text()').get()
    # NOTE(review): '/packages/p/sentry/' looks like a URL pasted where a
    # meta key (e.g. 'link_text') belongs — confirm against the request
    # that scheduled this callback before relying on it.
    item['link_text'] = response.meta['/packages/p/sentry/']
    url = response.xpath('//td[@id="additional_data"]/@href').get()
    # ROBUSTNESS: response.follow(None) raises; if the link is absent,
    # return the partially filled item instead of crashing.
    if url is None:
        return item
    return response.follow(url, self.parse_additional_page,
                           cb_kwargs=dict(item=item))
def parse_item(self, response):
    """Scrape an item page, then follow the additional-data link to
    finish the item in parse_additional_page."""
    self.logger.info('Hi, this is an item page! %s', response.url)
    # BUG FIX: a bare scrapy.Item() declares no fields, so the key
    # assignments below raised KeyError; use a plain dict.
    item = {}
    item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = response.xpath('//td[@id="item_name"]/text()').get()
    item['description'] = response.xpath(
        '//td[@id="item_description"]/text()').get()
    # link_text is set by the LinkExtractor that scheduled this request.
    item['link_text'] = response.meta['link_text']
    url = response.xpath('//td[@id="additional_data"]/@href').get()
    # ROBUSTNESS: response.follow(None) raises; if the link is absent,
    # return the partially filled item instead of crashing.
    if url is None:
        return item
    return response.follow(url, self.parse_additional_page,
                           cb_kwargs=dict(item=item))
def parse_item(self, response):
    """Walk every link on the page; for each new same-site URL, push a
    message to the queue (send_message) and remember it in self.LINKS.

    Generator callback: yields one (empty) item, then performs the
    link-harvesting side effects.
    """
    # NOTE(review): a bare scrapy.Item() has no declared fields, so this
    # yields an empty placeholder item — presumably only to satisfy the
    # pipeline; confirm whether it is needed at all.
    item = scrapy.Item()
    yield item
    for url in response.xpath('//a/@href').extract():
        # Skip empty hrefs and in-page anchors.
        if url and not url.startswith('#'):
            # Resolve relative links against the current page.
            url = urljoin(response.url, url)
            if url.lower().endswith(tuple(IGNORED_EXTENSIONS)):
                continue
            # Strip query string and fragment before de-duplication.
            url = url.split("?")[0]
            url = url.split("#")[0]
            if any(x in url for x in self.UNWANTED):
                continue
            elif self.parsed_url.netloc in url and url not in self.LINKS:
                # The Request object is constructed and discarded —
                # apparently only to validate the URL (the constructor
                # raises on malformed URLs); it is never scheduled.
                try:
                    scrapy.http.Request(url, meta={
                        'dont_redirect': True,
                        'download_timeout': 20
                    })
                except Exception:  # pragma: no cover
                    continue
                domain, subdomain, path = MySpider.format_url(url)
                # partitionKey is agency+org+domain+subdomain+path
                db_id = (f'{self.agency}+{self.organization}+'
                         f'{domain}+{subdomain}+{path}')
                msg_body = json.dumps(
                    dict(Agency=self.agency,
                         Organization=self.organization,
                         domain=domain,
                         subdomain=subdomain,
                         tld='gov',
                         routeable_url=url,
                         db_id=db_id))
                entry = {'Id': '1', 'MessageBody': msg_body}
                # Side effect: enqueue the discovered URL downstream.
                send_message(entry)
                # Mark as seen so the same URL is not sent twice.
                self.LINKS.add(url)
def test_process_unknown_item(self, pipeline, spider):
    """An item the pipeline does not recognise passes through unchanged."""
    unknown = scrapy.Item()
    result = pipeline.process_item(unknown, spider)
    assert result == unknown
def parse_item(self, response):
    """Stub callback: hands back an empty item for every page."""
    return scrapy.Item()
def parse_item(self, response):
    """Scrape a profile page: display name and tweet count."""
    # BUG FIX: a bare scrapy.Item() declares no fields, so the key
    # assignments below raised KeyError; use a plain dict.
    item = {}
    # ROBUSTNESS: .get() is the modern spelling of .extract()[0] and
    # returns None instead of raising IndexError when the node is absent.
    item['name'] = response.xpath('.//@data-name').get()
    item['tweet_count'] = response.css('.ProfileNav-value::text').get()
    return item
def parse_rooms(self, response):
    """Scrape one room page into a dict, record the URL, and yield it."""
    # NOTE(review): re-fetching the page with requests bypasses Scrapy's
    # scheduler, caching and middleware — response.text should already
    # hold this body; confirm before removing the extra round trip.
    res = requests.get(response.url,
                       headers={'User-Agent': 'Mozilla/5.0'}).text
    soup = BeautifulSoup(res, 'html.parser')
    # price
    price = soup.find(
        'span', {'class': 'room__sidebar--rate-base'}).text.strip()
    # building name
    building_name = soup.find(
        'h5', {'class': 'room__location--title'}).text.strip()
    # room name
    room_name = soup.find('h1', {'class': 'room__title'}).text.strip()
    # room features
    room_features = soup.find(
        'div', {'class': 'room__features'}).get_text().split('\n')
    self.remove_empty_strings(room_features)
    print('crawling -> ' + response.url)
    # capacity — brittle positional selector; breaks if the sidebar
    # layout changes.
    capacity = soup.select(
        '#body > div.global-wrapper > main > section.section-room > '
        'div.row > div.columns.small-12.medium-4.room-sidebar > div > '
        'div.room__sidebar--form-wrapper.loader-wrapper > '
        'div.room__sidebar--icons > ul > li:nth-child(1)')[0].text.strip()
    # location
    location = soup.find('div', {'class': 'address'}).get_text().strip()
    # city of location
    cityOfLocation = self.get_city_of_location(location)
    # BUG FIX: the original wrote into item.fields — the CLASS-LEVEL
    # field registry shared by every scrapy.Item instance — and yielded
    # that dict, corrupting shared state.  Build a plain dict instead.
    item = {
        'city': cityOfLocation,
        'buildingName': building_name,
        'roomName': room_name,
        'price': float(price),
        'capacity': int(capacity),
        'roomFeatures': room_features,
        'location': location,
    }
    self.visited_url[response.url] = None
    yield item
def parse(self, response):
    """Log the size of the fetched body and hand back an empty item."""
    body_size = len(response.body)
    self.log('get response size: %s' % body_size)
    return scrapy.Item()
def parse_item(self, response):
    """Stub callback: logs the visited page and builds an (unused) item.

    Note: the item is created but never returned or yielded, so this
    callback currently produces nothing for the pipeline.
    """
    self.logger.info('Hi, this is an item page! %s', response.url)
    item = scrapy.Item()
class MercadolibreItem(scrapy.Item):
    """Item holding the fields scraped for one MercadoLibre listing."""
    # BUG FIX: fields on a scrapy.Item subclass must be declared with
    # scrapy.Field(), not scrapy.Item() — Field() registers the key so
    # item['titulo'] = ... works; Item() would just nest an empty item
    # and every assignment through these keys would fail.
    titulo = scrapy.Field()                    # listing title
    descripcion = scrapy.Field()               # description text
    condiciones = scrapy.Field()               # sale conditions
    precio = scrapy.Field()                    # price
    color = scrapy.Field()                     # colour variant
    disponible = scrapy.Field()                # availability
    imagen_url = scrapy.Field()                # image URL
    ubicacion = scrapy.Field()                 # seller location
    reputacion = scrapy.Field()                # seller reputation
    antiguedad_mercadolibre = scrapy.Field()   # seller tenure on the site
    ventas_concretadas = scrapy.Field()        # completed sales count
    url = scrapy.Field()                       # listing URL