def parse_item(self, response):
    """Parse a category page into a CouponsItem (supplier 998).

    Valid pages are recognized by the presence of the category-description
    element; invalid pages yield nothing.
    """
    # Validity is checked on the element itself (not its text node).
    isValid = response.css(
        '#ctl00_ContentPlaceHolder2_LabelCategoryDescription'
    ).extract_first() is not None
    if isValid:
        # FIX: the element can exist with no text node, making
        # `::text ... .get()` return None; the original then crashed with
        # `None + str` TypeErrors. Default both pieces to '' instead.
        title = response.css(
            '#ctl00_ContentPlaceHolder2_LabelCategoryName::text').get() or ''
        description = (response.css(
            '#ctl00_ContentPlaceHolder2_LabelCategoryDescription::text'
        ).get() or '') + cleanString(
            response.css(
                '#ctl00_ContentPlaceHolder2_LabelCategoryText').extract())
        yield CouponsItem(
            Title=title,
            supplier='998',
            brand=filterBrands(
                cleanString(
                    response.css(
                        '#ctl00_ContentPlaceHolder2_LabelCategoryName::text'
                    ).get() or ''),
                self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            # Shipping flag: any allowed-shipping marker appearing anywhere
            # in the combined description + title text.
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
def parse(self, response):
    """Parse a product page into a CouponsItem (supplier 16).

    Falls back to the club-banner text when the full description is
    missing; merges the red-box title when it differs from the green box.
    """
    description = cleanString(
        response.css("div#full-description-text").extract())
    if not description:
        description = cleanString(
            response.css("div.banner-club-big-text-box").extract())
    greenbox = cleanString(
        response.css("h1.productTitle").extract()) + cleanString(
            response.css("div.productSubTitle").extract())
    big_redbox = cleanString(
        response.css("td.product-list-checkboxes").extract())
    if similar(greenbox, big_redbox) > 0.9:
        # Near-duplicate boxes: the green box alone is the title.
        title = greenbox
    else:
        # FIX: the original ran the identical re.search twice over the
        # stringified body (once for the value, once for the guard).
        # Search once and reuse the match object.
        price_match = re.search(r"'PriceDiscount':\s'\d{1,}'",
                                str(response.body))
        low_price = price_match.group(0) if price_match else ''
        title = greenbox + self.integrator + big_redbox + low_price.replace(
            "'PriceDiscount':", '')
    yield CouponsItem(
        Title=title,
        supplier='16',
        brand=filterBrands(
            cleanString(response.css("h1.productTitle").extract()),
            self.brands),
        JoinUrl=response.url,
        Description=description,
        ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
        DoorToDoorShipping=any(ext in (description + title)
                               for ext in allowed_shipping_list),
        cyclerun=self.cycleid)
def parse(self, response):
    """Resolve a benefit page via the benefit-details API (supplier 993).

    Extracts the benefit id from the URL, POSTs it to the club API, and
    yields a CouponsItem built from the first returned benefit record.
    """
    m = re.search(r'https://www\.yours\.co\.il/benefits/(.+)/', response.url)
    if m:
        benefit_id = m.group(1)
        formdata = {
            'club_id': f'{self.siteUuid}',
            'benefits_id': f'{benefit_id}'
        }
        # Synchronous side-request to the benefits API (outside Scrapy's
        # scheduler) — presumably intentional here; verify if throttling
        # becomes an issue.
        r = requests.post(self.getBenefitDetails, json=formdata)
        if r.status_code == 200:
            data = r.json().get('benefits')[0]
            description = cleanString(data['benefits_description'])
            title = cleanString(data['benefits_name'])
            yield CouponsItem(
                Title=title,
                supplier='993',
                brand=filterBrands(description, self.brands),
                JoinUrl=self.linkBase + data['benefits_id'],
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)
        else:
            # FIX: the original concatenated a str with the int status code
            # ("..." + r.status_code), raising TypeError on the error path
            # instead of logging. Lazy %-args handle the int correctly.
            self.logger.error("something went wrong:::%s", r.status_code)
def parse_coupon(self, response):
    """Scrape one coupon detail page and yield a CouponsItem (supplier 990).

    Pages without the cafe logo are not coupon pages and are skipped.
    """
    # Guard clause: bail out early when the marker element is absent.
    if response.css('i.cafe_logo').extract_first() is None:
        return
    body_text = cleanString(response.css('span.small_text').extract())
    headline = cleanString(response.css('div.richtext_div').extract_first())
    # Brand comes from the section header, with its trailing 3 chars dropped.
    brand_source = cleanString(
        response.css('div.d_section > h1::text').get())[:-3]
    yield CouponsItem(
        Title=headline,
        supplier='990',
        brand=filterBrands(brand_source, self.brands),
        JoinUrl=self.linkBase.format(response.meta['d_number']),
        Description=body_text,
        ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
        DoorToDoorShipping=any(marker in (body_text + headline)
                               for marker in allowed_shipping_list),
        cyclerun=self.cycleid)
def parse(self, response):
    """Parse a benefit page into a CouponsItem (supplier 996)."""
    # Only pages that render the benefit-details text block are valid.
    isValid = response.css('.benefit-details-txt').extract_first() is not None
    if isValid:
        description = cleanString(
            response.css("div.benefit-details-txt").extract())
        title = cleanString(
            response.css("div.benefit-info h1::text").extract_first())
        yield CouponsItem(
            Title=title,
            supplier='996',
            # Reuse `title`: the original re-ran the identical selector and
            # cleanString a second time just to compute the brand input.
            brand=filterBrands(title, self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
def parse(self, response):
    """Parse a product page into a CouponsItem (supplier 994)."""
    # The 6th product-info list item carries the offer description.
    description = cleanString(
        response.css('ul.product-info li:nth-child(6)').extract())
    title = cleanString(response.css("#ptitle::text").get())
    yield CouponsItem(
        Title=title,
        supplier='994',
        # Reuse `title`: the original re-ran the identical #ptitle selector
        # and cleanString a second time for the brand input.
        brand=filterBrands(title, self.brands),
        JoinUrl=response.url,
        Description=description,
        ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
        DoorToDoorShipping=any(ext in (description + title)
                               for ext in allowed_shipping_list),
        cyclerun=self.cycleid)
def parse_item(self, response):
    """Build a CouponsItem (supplier 1) from a product detail page.

    The brand name arrives URL-encoded via request meta and is decoded
    before brand filtering.
    """
    desc_text = cleanString(response.css("div.desc").extract())
    item_title = cleanString(
        response.css("div.info-box h1.title").extract_first())
    decoded_brand = urllib.parse.unquote(response.meta["brandname"])
    yield CouponsItem(
        Title=item_title,
        supplier='1',
        brand=filterBrands(decoded_brand, self.brands),
        JoinUrl=response.url,
        Description=desc_text,
        ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
        DoorToDoorShipping=any(token in (desc_text + item_title)
                               for token in allowed_shipping_list),
        cyclerun=self.cycleid)
def classfy(self, response):
    """Second-URL callback: route the response by its URL prefix.

    Only the search prefix produces an item; the other known prefixes
    are placeholder branches that just print their site type.
    """
    gooditem = CouponsItem()
    print('parse', response.meta)
    # The scheme plus first host label identifies the site type,
    # e.g. 'http://search' from 'http://search.example.com/...'.
    prefix = response.url.split('.')[0]
    if prefix == 'http://search':
        gooditem['good'] = (response.meta, self._parseGoods(response))
        yield gooditem
    elif prefix in ('http://what', 'http://mall', 'http://hotel'):
        # Placeholder handling: emit the bare site type, same as the
        # original per-branch prints ('what' / 'mall' / 'hotel').
        print(prefix.rsplit('//', 1)[1])
def parse(self, response):
    """Parse the mall sales JSON feed into CouponsItems (supplier 2).

    Iterates every sale of every data record; titles are prefixed with
    the filtered brand name.
    """
    payload = json.loads(response.body)
    if payload.get('meta').get('totalitems') > 0:
        for record in payload.get('data'):
            for sale in record.get('attributes').get('sales'):
                sale_desc = cleanString(sale['description'])
                brand_name = filterBrands(
                    cleanString(sale['brand']['title']), self.brands)
                full_title = f"{brand_name} {cleanString(sale['title'])}"
                join_url = (
                    f"https://myofer.co.il{response.meta['mallName']}"
                    f"brands/{sale['brand']['seoname']}/{sale['id']}")
                yield CouponsItem(
                    Title=full_title,
                    supplier='2',
                    brand=brand_name,
                    JoinUrl=join_url,
                    Description=sale_desc,
                    ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                    DoorToDoorShipping=any(
                        ext in (sale_desc + full_title)
                        for ext in allowed_shipping_list),
                    cyclerun=self.cycleid)
def parse_item(self, response):
    """Parse a benefit page into a CouponsItem (supplier 997)."""
    isValid = response.css(
        'div.text_more_info_white.benefits').extract_first() is not None
    # Hoist the cleaned page title once: the original extracted and cleaned
    # the same h2.page-title selector three separate times (validity check,
    # Title field, and brand input).
    title = cleanString(response.css("h2.page-title::text").extract_first())
    # `title` truthiness already excludes "" — the original's extra
    # (validCoupon != "") comparison was redundant.
    if isValid and title:
        description = cleanString(response.css("div#main").extract())
        yield CouponsItem(
            Title=title,
            supplier='997',
            brand=filterBrands(title, self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
def parse(self, response):
    """Walk the nav-menu anchors and yield special-offer items (supplier 101).

    Each anchor's title becomes the coupon title and its image src is used
    as the description; only dreamcard special-offer links are emitted.
    """
    # Compile once outside the loop; same pattern bytes as before.
    offer_pattern = re.compile(
        r'https:\/\/www\.dreamcard\.co\.il\/special-offers\/')
    for anchor_html in response.css("nav.content ul.center li a").extract():
        soup = BeautifulSoup(anchor_html, 'lxml')
        anchor = soup.find("a")
        link = anchor.get('href')
        offer_title = anchor.get('title')
        image_src = soup.find("img").get('src')
        if offer_pattern.search(link):
            yield CouponsItem(
                Title=cleanString(offer_title),
                supplier='101',
                brand=filterBrands(offer_title, self.brands),
                JoinUrl=link + "&&" + str(convertToNumber(offer_title)),
                Description=image_src,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (offer_title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)
def parse(self, response):
    """Parse a Max benefit page into a CouponsItem (suppliers 100/995)."""
    if self.starter:
        # One-shot kickoff of the anonymous-scrape flow on the first call.
        yield scrapy.Request(self.home,
                             meta={'selenium': False},
                             callback=self.anonymous_scrape)
        self.starter = False
    isValid = response.css('.benefitInfo_content').extract_first() is not None
    if isValid:
        # The online subdomain renders its description differently.
        if 'online.max.co.il' in response.url:
            description = cleanString(
                response.css("div.richHtml p").extract())
        else:
            description = cleanString(
                response.css(
                    "#ctl00_PlaceHolderMain_ctl00_divInitializeWrapperClass p"
                ).extract())
        title = cleanString(
            response.css("div.benefitInfo_content h2::text").extract_first())
        yield CouponsItem(
            Title=title,
            # FIX: the original nested conditional returned '995' for both
            # the /BeyahadBishvilha/Pages/ match and the fallback, so that
            # second re.search was dead work; only /Biz/Pages/ matters.
            supplier='100' if re.search(r'/Biz/Pages/', response.url)
            else '995',
            brand=filterBrands(
                cleanString(
                    response.css("div.benefitInfo_content h1::text")
                    .extract_first()),
                self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)