# Scrapy coupon spiders. Shared stdlib / third-party imports are declared
# here; the project-local names used throughout (CouponsItem, getBrands,
# cleanString, filterBrands, similar, convertToNumber, allowed_shipping_list,
# the *Handler / process_* link callbacks and my_selenium_request_processor)
# are defined elsewhere in this repo, so their import paths are not
# reproduced here.
import json
import re
import urllib.parse
from datetime import datetime

import requests
import scrapy
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class UniqclubSpider(scrapy.Spider):
    name = 'uniqClub'
    undetectable = False
    wait = False
    allowed_domains = ['uniq-club.co.il']
    start_urls = ['http://uniq-club.co.il/']
    apiBase = 'https://www.uniq-club.co.il/discounts/ajax/{}'
    linkBase = 'https://www.uniq-club.co.il/discounts#{}'
    brands = getBrands()

    def __init__(self, *args, **kwargs):
        super(UniqclubSpider, self).__init__(*args, **kwargs)
        # Crawl-cycle id, passed on the command line as -a cycleid=...
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        # Probe the discounts AJAX endpoint for every id in 0..999.
        for d in range(0, 1000):
            uri = self.apiBase.format(d)
            yield scrapy.Request(uri, callback=self.parse_coupon,
                                 method='POST', meta={'d_number': d})

    def parse_coupon(self, response):
        # A cafe_logo icon marks a real discount page.
        isValid = response.css('i.cafe_logo').extract_first() is not None
        if isValid:
            description = cleanString(response.css('span.small_text').extract())
            title = cleanString(response.css('div.richtext_div').extract_first())
            yield CouponsItem(
                Title=title,
                supplier='990',
                brand=filterBrands(
                    cleanString(response.css('div.d_section > h1::text').get())[:-3],
                    self.brands),
                JoinUrl=self.linkBase.format(response.meta['d_number']),
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)
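
# UniqclubSpider enumerates discount ids blindly and ties each AJAX probe
# back to a public page via a fragment URL. The helper below only illustrates
# that endpoint/link pairing for one arbitrary id; nothing in the spider
# depends on it.
def _uniq_urls_example(d):
    """
    >>> _uniq_urls_example(42)
    ('https://www.uniq-club.co.il/discounts/ajax/42', 'https://www.uniq-club.co.il/discounts#42')
    """
    return ('https://www.uniq-club.co.il/discounts/ajax/{}'.format(d),
            'https://www.uniq-club.co.il/discounts#{}'.format(d))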
class IsracardSpider(CrawlSpider):
    name = 'isracard'
    undetectable = False
    wait = False
    allowed_domains = ['benefits.isracard.co.il']
    start_urls = ['https://benefits.isracard.co.il/']
    brands = getBrands()
    # Note: Scrapy's docs advise against using 'parse' as a rule callback in a
    # CrawlSpider, since CrawlSpider implements its own parse(); kept as-is
    # because the whole repo follows this pattern.
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=itemHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=mainHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(IsracardSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        isValid = response.css('.benefit-details-txt').extract_first() is not None
        if isValid:
            description = cleanString(response.css("div.benefit-details-txt").extract())
            title = cleanString(response.css("div.benefit-info h1::text").extract_first())
            yield CouponsItem(
                Title=title,
                supplier='996',
                # The brand is derived from the same <h1> text as the title.
                brand=filterBrands(title, self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)
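
# The Isracard rules above hand project callbacks (itemHandler,
# categoryHandler, mainHandler) to LinkExtractor as process_value. Their real
# implementations live elsewhere in this repo; the function below is only a
# hedged sketch of the contract such a callback fulfils per Scrapy's API:
# receive the raw attribute value (here an onclick) and return a crawlable
# URL, or None to drop the link.
def _example_process_value(value):
    """Hypothetical onclick handler: extract the first quoted URL, if any."""
    m = re.search(r"['\"](https?://[^'\"]+)['\"]", value or '')
    return m.group(1) if m else None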
class DreamcardSpider(scrapy.Spider):
    name = 'dreamcard'
    undetectable = False
    wait = False
    allowed_domains = ['dreamcard.co.il']
    start_urls = ['https://www.dreamcard.co.il/special-offers']
    brands = getBrands()

    def __init__(self, *args, **kwargs):
        super(DreamcardSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        lst = response.css("nav.content ul.center li a").extract()
        for a in lst:
            soup = BeautifulSoup(a, 'lxml')
            anchor_tag = soup.find("a")
            href = anchor_tag.get('href')
            title = anchor_tag.get('title')
            img_tag = soup.find("img")
            desc = img_tag.get('src')
            # Only card links pointing back into /special-offers/ are offers.
            if re.search(r'https://www\.dreamcard\.co\.il/special-offers/', href):
                yield CouponsItem(
                    Title=cleanString(title),
                    supplier='101',
                    brand=filterBrands(title, self.brands),
                    JoinUrl=href + "&&" + str(convertToNumber(title)),
                    Description=desc,
                    ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                    DoorToDoorShipping=any(ext in title
                                           for ext in allowed_shipping_list),
                    cyclerun=self.cycleid)
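
# A quick, self-contained check of the DreamcardSpider URL filter above; the
# sample href in the doctest is made up for illustration.
def _dreamcard_href_matches(href):
    """Return True when an <a> href points at a special-offers page.

    >>> _dreamcard_href_matches('https://www.dreamcard.co.il/special-offers/example-offer')
    True
    """
    return re.search(r'https://www\.dreamcard\.co\.il/special-offers/', href) is not None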
class MyoferSpider(scrapy.Spider):
    name = 'myofer'
    undetectable = False
    wait = False
    allowed_domains = ['myofer.co.il']
    start_urls = ['http://myofer.co.il/']
    token = ''
    brands = getBrands()
    BASE_URL = 'https://api-mobile.myofer.co.il/v2/sales'

    def __init__(self, *args, **kwargs):
        super(MyoferSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_malls,
                                 meta={'selenium': True, 'myoferToken': True})

    def parse_malls(self, response):
        # The Selenium middleware exposes the browser cookies in
        # meta['cookieJar']; the 'token' cookie authorizes the mobile API.
        self.token = next(item for item in response.meta["cookieJar"]
                          if item["name"] == "token").get('value')
        result = [re.search(r"/(.+)/category/all-benefits", i).group(0)
                  for i in response.css("a.mall-name-expanded::attr(href)").extract()
                  if re.search(r"/(.+)/category/all-benefits", i)]
        for uri in result:
            yield scrapy.Request(f'https://myofer.co.il{uri}',
                                 callback=self.parse_mallId)

    def parse_mallId(self, response):
        # Was {'Accept': 'Accept: application/json'}; the header value should
        # not repeat the header name.
        headers = {'Accept': 'application/json',
                   'Authorization': f'Bearer {self.token}'}
        idString = re.search(r"mallId=\d+&", str(response.body))
        if idString:
            d = re.search(r"\d+", idString.group(0))
            if d:
                path = re.search(r"\.il/(.+)/c", str(response.url))
                mallname = (re.search(r"/(.+)/", path.group(0)).group(0)
                            if path else None)
                if mallname:
                    params = {'mallId': d.group(0), 'limit': 100000}
                    url = f'{self.BASE_URL}?{urllib.parse.urlencode(params)}'
                    yield scrapy.Request(url, callback=self.parse,
                                         headers=headers,
                                         meta={'mallName': mallname})

    def parse(self, response):
        result = json.loads(response.body)
        n = result.get('meta').get('totalitems')
        if n > 0:
            for r in result.get('data'):
                for sale in r.get('attributes').get('sales'):
                    description = cleanString(sale['description'])
                    brandName = filterBrands(cleanString(sale['brand']['title']),
                                             self.brands)
                    title = f"{brandName} {cleanString(sale['title'])}"
                    yield CouponsItem(
                        Title=title,
                        supplier='2',
                        brand=brandName,
                        JoinUrl=(f"https://myofer.co.il{response.meta['mallName']}"
                                 f"brands/{sale['brand']['seoname']}/{sale['id']}"),
                        Description=description,
                        ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                        DoorToDoorShipping=any(ext in (description + title)
                                               for ext in allowed_shipping_list),
                        cyclerun=self.cycleid)
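
# MyoferSpider builds its API call with urllib.parse.urlencode; the sketch
# below shows the resulting request URL for a made-up mall id.
def _example_sales_url(mall_id):
    """
    >>> _example_sales_url('17')
    'https://api-mobile.myofer.co.il/v2/sales?mallId=17&limit=100000'
    """
    query = urllib.parse.urlencode({'mallId': mall_id, 'limit': 100000})
    return f'https://api-mobile.myofer.co.il/v2/sales?{query}'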
class DinersSpider(CrawlSpider):
    name = 'diners'
    undetectable = True
    wait = True
    elementId = 'cal-shop-brand'
    allowed_domains = ['diners-store.co.il']
    start_urls = ['https://www.diners-store.co.il/']
    brands = getBrands()
    integrator = '-כותרת משנה'  # Hebrew for "-subtitle"; glues the two title boxes
    rules = [
        Rule(LinkExtractor(allow=(), process_value=itemHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(DinersSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(response.css("div#full-description-text").extract())
        if not description:
            description = cleanString(response.css("div.banner-club-big-text-box").extract())
        greenbox = (cleanString(response.css("h1.productTitle").extract())
                    + cleanString(response.css("div.productSubTitle").extract()))
        big_redbox = cleanString(response.css("td.product-list-checkboxes").extract())
        if similar(greenbox, big_redbox) > 0.9:
            # The two boxes carry near-identical text; keep one copy.
            title = greenbox
        else:
            price_match = re.search(r"'PriceDiscount':\s'\d{1,}'", str(response.body))
            low_price = price_match.group(0) if price_match else ''
            title = (greenbox + self.integrator + big_redbox
                     + low_price.replace("'PriceDiscount':", ''))
        yield CouponsItem(
            Title=title,
            supplier='16',
            brand=filterBrands(cleanString(response.css("h1.productTitle").extract()),
                               self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
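
# `similar` (used by DinersSpider above) is a project helper whose source is
# not shown here. A plausible stand-in returning a 0..1 ratio, assuming it
# behaves like difflib's SequenceMatcher:
def _similar_sketch(a, b):
    """Hypothetical equivalent of the repo's similar() helper.

    >>> _similar_sketch('abc', 'abc')
    1.0
    """
    from difflib import SequenceMatcher
    return SequenceMatcher(None, a, b).ratio()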
class OgenSpider(CrawlSpider):
    name = 'ogen'
    undetectable = False
    wait = False
    allowed_domains = ['ogen.org.il']
    start_urls = ['https://ogen.org.il/']
    brands = getBrands()
    # Retired AJAX entry point, kept for reference:
    # ajax_url = 'https://ogen.org.il/wp-admin/admin-ajax.php'
    # payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"action\"\r\n\r\nmatat_filter\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"data\"\r\n\r\nminPrice=0&maxPrice=1000\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"security\"\r\n\r\ncb03a93ccd\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    # headers = {
    #     'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
    #     'cache-control': "no-cache",
    #     'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    # }
    rules = [
        Rule(LinkExtractor(allow=('/product-category/',)), follow=True),
        Rule(LinkExtractor(allow=('/product/',)), callback='parse'),
    ]

    # def start_requests(self):
    #     yield scrapy.Request(self.start_urls[0], callback=self.ajax_parse)

    # def ajax_parse(self, response):
    #     result = requests.request("POST", self.ajax_url, data=self.payload, headers=self.headers)
    #     response = HtmlResponse(self.ajax_url, body=result.text, encoding='utf-8')
    #     products = [i for i in response.css("a::attr(href)").extract() if re.search(r"/product/", i)]
    #     for links in products:
    #         yield scrapy.Request(links, callback=self.parse)
    #     return super(OgenSpider, self).start_requests()

    def __init__(self, *args, **kwargs):
        super(OgenSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(response.css("div.short-info p::text").getall())
        title = (cleanString(response.css("h2.product-name::text").get())
                 + cleanString(response.css("div.price").extract()))
        yield CouponsItem(
            Title=title,
            supplier='992',
            brand=filterBrands(cleanString(response.css("h2.product-name::text").get()),
                               self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
class PaisplusSpider(CrawlSpider):
    name = 'pais'
    undetectable = True
    wait = True
    elementId = 'accesability_container'
    allowed_domains = ['paisplus.co.il']
    apiBase = 'https://data.dolcemaster.co.il'
    start_urls = ['https://paisplus.co.il/']
    siteUuid = 'BBAD629F-E549-4612-9EAE-3AA9E85F1C33'
    linkBase = 'https://www.paisplus.co.il/benefits/'
    getBenefitDetails = urllib.parse.urljoin(apiBase, 'api/v5_1/public/benefits_details')
    # Was {'Accept': 'Accept: application/json'}; the header value should not
    # repeat the header name.
    headers = {'Accept': 'application/json'}
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=('/category/',)),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=('/benefits/',)), callback='parse',
             process_request=my_selenium_request_processor, follow=False),
    ]

    def __init__(self, *args, **kwargs):
        super(PaisplusSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        m = re.search(r'https://www\.paisplus\.co\.il/benefits/(.+)/', response.url)
        if m:
            benefit_id = m.group(1)
            formdata = {'club_id': self.siteUuid, 'benefits_id': benefit_id}
            r = requests.post(self.getBenefitDetails, json=formdata)
            if r.status_code == 200:
                data = r.json().get('benefits')[0]
                description = cleanString(data['benefits_description'])
                title = cleanString(data['benefits_name'])
                yield CouponsItem(
                    Title=title,
                    supplier='991',
                    brand=filterBrands(description, self.brands),
                    JoinUrl=self.linkBase + data['benefits_id'],
                    Description=description,
                    ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                    DoorToDoorShipping=any(ext in (description + title)
                                           for ext in allowed_shipping_list),
                    cyclerun=self.cycleid)
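
# PaisplusSpider fetches benefit details with a plain requests.post; the
# endpoint and JSON keys below come straight from the spider above, while the
# benefit id argument is a placeholder supplied by the caller.
def _fetch_benefit_details(benefit_id,
                           club_id='BBAD629F-E549-4612-9EAE-3AA9E85F1C33'):
    """Return the parsed benefits list, or None on a non-200 response."""
    r = requests.post(
        'https://data.dolcemaster.co.il/api/v5_1/public/benefits_details',
        json={'club_id': club_id, 'benefits_id': benefit_id})
    return r.json().get('benefits') if r.status_code == 200 else None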
class HvrSpider(CrawlSpider):
    name = 'hvr'
    undetectable = True
    elementId = 'wrap'
    wait = True
    allowed_domains = ['hvr.co.il']
    start_urls = []
    signin_url = 'https://hvr.co.il/signin.aspx'
    usrEId, username = '******', '052046133'
    pwdEId, password = '******', '5167722'
    brands = getBrands()
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/84.0.4147.135 Safari/537.36'}
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('data-item_id',), process_value=process_item_id),
             callback='parse_item', follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_item_href),
             callback='parse_item', follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_lst),
             callback='parse_lst', follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_cat),
             follow=True),
    ]

    def start_requests(self):
        yield scrapy.Request(self.signin_url, callback=self.after_login,
                             meta={"selenium": True, "login": True,
                                   "elementId": "tz"})

    def after_login(self, response):
        # Collect the *.json menu descriptors the page references, fetch each
        # one through the site's /ajax/ endpoint, and seed start_urls with the
        # pages they point at.
        result = [re.search(r"(.+?)\.json", i).group(0)
                  for i in response.css("div::attr(title)").extract()
                  if re.search(r"(.+?)\.json", i)]
        result2 = [re.search(r"(.+?)\.json", i).group(0)
                   for i in response.css("div::attr(data-json)").extract()
                   if re.search(r"(.+?)\.json", i)]
        for uri in result + result2:
            # Drop anything up to an escaped backslash left by the markup.
            stripped = re.search(r"(?<=\\).*", uri)
            uri = stripped.group(0) if stripped else uri
            r = requests.get('https://www.hvr.co.il/ajax/' + uri, headers=self.headers)
            m = (re.findall(r"(page|url)':\s?'(.+?)'", str(r.json()))
                 if r.status_code == 200 else None)
            if m:
                for t, s in m:
                    n = re.search(r"(?=home_page\.aspx).*", s)
                    if t == 'page':
                        url = 'https://www.hvr.co.il/home_page.aspx?page=' + s
                    elif t == 'url' and n:
                        url = 'https://www.hvr.co.il/' + n.group(0)
                    else:
                        # The original fell through here and re-appended the
                        # stale url (or raised NameError on the first pass).
                        continue
                    self.start_urls.append(url)
        self.start_urls.append(response.url)
        return super(HvrSpider, self).start_requests()

    def parse_lst(self, response):
        template_links = re.findall(r'template_link:\s?"(.+)\d{5,8}"', str(response.body))
        for uri in template_links:
            yield scrapy.Request(urllib.parse.urljoin('https://www.hvr.co.il/', uri),
                                 callback=self.parse_item)

    def parse_item(self, response):
        # Placeholder: item pages are only logged, not yielded, for this spider.
        print(response.url)

    parse_start_url = parse_item
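
# The lookbehind regex used in HvrSpider.after_login trims everything up to
# the first backslash in a descriptor path; a self-contained illustration
# with invented file names:
def _strip_escaped_prefix(uri):
    r"""Drop everything up to the first backslash, as HvrSpider does.

    >>> _strip_escaped_prefix('menu\\items.json')
    'items.json'
    >>> _strip_escaped_prefix('items.json')
    'items.json'
    """
    m = re.search(r"(?<=\\).*", uri)
    return m.group(0) if m else uri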
class TovSpider(CrawlSpider):
    name = 'tov'
    undetectable = True
    wait = False
    allowed_domains = ['tov.org.il']
    start_urls = []
    signin_url = 'https://www.tov.org.il/signin.aspx'
    usrEId, username = '******', '025190273'
    pwdEId, password = '******', '2535271'
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('data-href', 'href', 'onclick'),
                           process_value=itemHandler),
             callback='parse_item', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('data-href', 'href', 'onclick'),
                           process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def start_requests(self):
        yield scrapy.Request(self.signin_url, callback=self.after_login,
                             meta={"selenium": True, "login": True})

    def after_login(self, response):
        self.start_urls.append(response.url)
        return super(TovSpider, self).start_requests()

    def parse_item(self, response):
        # Placeholder: crawled item URLs are appended to a local log file.
        with open('log.txt', 'a') as f:
            f.write(response.url + '\n')

    parse_start_url = parse_item
class MegaleanSpider(CrawlSpider):
    name = 'megalean'
    undetectable = False
    wait = False
    allowed_domains = ['megalean.co.il']
    start_urls = ['https://www.megalean.co.il/site/pg/home']
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), attrs=('onclick',),
                           process_value=process_item),
             callback='parse', follow=True),
        Rule(LinkExtractor(allow=(), tags=('div',), attrs=('data-href',),
                           process_value=process_divLinks),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=('/search/', '/cat_')),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(MegaleanSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(
            response.css('ul.product-info li:nth-child(6)').extract())
        title = cleanString(response.css("#ptitle::text").get())
        yield CouponsItem(
            Title=title,
            supplier='994',
            # The brand is derived from the same #ptitle text as the title.
            brand=filterBrands(title, self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
class BehatsdaSpider(CrawlSpider):
    name = 'behatsda'
    undetectable = True
    wait = True
    elementId = 'aspnetForm'
    allowed_domains = ['behatsdaa.org.il']
    login_url = 'https://behatsdaa.org.il/'
    start_urls = ['https://www.behatsdaa.org.il/HomePage.aspx']
    usrEId, username = '******', '0000'
    pwdEId, password = '******', '0000'
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), tags=('a', 'area', 'button', 'div'),
                           attrs=('onclick', 'href'),
                           process_value=process_dealsClick),
             follow=False, callback='parse_item'),
        Rule(LinkExtractor(allow=(), tags=('a', 'area', 'button', 'div'),
                           attrs=('onclick', 'href'),
                           process_value=process_catNumberOnclick),
             follow=False, callback='parse_item'),
        Rule(LinkExtractor(allow=(), tags=('a', 'area', 'button', 'div'),
                           attrs=('onclick', 'href'),
                           process_value=process_catOrderOnclick),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(r"deals\.php\?filter",)),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(BehatsdaSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        yield scrapy.Request(self.login_url, callback=self.after_login,
                             meta={"selenium": True, "login": True,
                                   "elementId": "TextBoxPersonalNumber"})

    def after_login(self, response):
        return super(BehatsdaSpider, self).start_requests()

    def parse_item(self, response):
        isValid = response.css(
            '#ctl00_ContentPlaceHolder2_LabelCategoryDescription'
        ).extract_first() is not None
        if isValid:
            title = response.css(
                '#ctl00_ContentPlaceHolder2_LabelCategoryName::text').get()
            description = response.css(
                '#ctl00_ContentPlaceHolder2_LabelCategoryDescription::text'
            ).get() + cleanString(response.css(
                '#ctl00_ContentPlaceHolder2_LabelCategoryText').extract())
            yield CouponsItem(
                Title=title,
                supplier='998',
                brand=filterBrands(cleanString(title), self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)

    parse_start_url = parse_item
class ImgSpider(CrawlSpider):
    name = 'teachersUnion'
    undetectable = False
    wait = False
    allowed_domains = ['igm.org.il']
    encoding = 'utf-8'
    start_urls = ['https://www.igm.org.il/home_page.aspx?page=megalean_home/']
    base_url = "https://www.igm.org.il/"
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_item),
             callback="parse_item"),
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_couponOnclick),
             callback="parse_item"),
        Rule(LinkExtractor(allow=("search",), attrs=('onclick', 'href'),
                           tags=("div", "a", "area"),
                           process_value=process_searchOnclick),
             follow=True),
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_couponFollow),
             follow=True),
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_nextPageOnclick),
             follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(ImgSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        yield scrapy.Request('https://www.igm.org.il/home_page.aspx?page=igm_PNBS',
                             callback=self.get_json, meta={'selenium': True})

    def get_json(self, response):
        result = [re.search(r"(.+?)\.json", i).group(0)
                  for i in response.css("div.sidebarObject::attr(title)").extract()]
        for uri in result:
            yield scrapy.Request('https://www.igm.org.il/ajax/' + uri,
                                 callback=self.parse_json)

    def parse_json(self, response):
        jsonStr = str(response.body, self.encoding)
        m = re.findall(r'"url":"(.+?)\.aspx\?(.+?)"', jsonStr)
        for uri in m:
            self.start_urls.append(
                urllib.parse.urljoin(self.base_url, '.aspx?'.join(uri)))
        return super(ImgSpider, self).start_requests()

    def parse_item(self, response):
        isValid = response.css(
            'div.text_more_info_white.benefits').extract_first() is not None
        validCoupon = cleanString(
            response.css("h2.page-title::text").extract_first())
        if isValid and validCoupon and (validCoupon != ""):
            description = cleanString(response.css("div#main").extract())
            # The page title doubles as both the item title and brand source.
            title = validCoupon
            yield CouponsItem(
                Title=title,
                supplier='997',
                brand=filterBrands(title, self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)

    parse_start_url = parse_item
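
# ImgSpider.parse_json captures each URL as a (path, query) pair split around
# '.aspx?' and rejoins it when queueing; a compact illustration with an
# invented pair:
def _rejoin_aspx(parts):
    """
    >>> _rejoin_aspx(('home_page', 'page=deals'))
    'https://www.igm.org.il/home_page.aspx?page=deals'
    """
    return urllib.parse.urljoin('https://www.igm.org.il/', '.aspx?'.join(parts))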
class MaxSpider(CrawlSpider):
    name = 'max'
    undetectable = False
    wait = False
    allowed_domains = ['max.co.il']
    start_urls = ['https://www.max.co.il/he-il/Benefits/Pages/SummerBenefits.aspx']
    home = 'https://www.max.co.il/he-il/Benefits/Pages/SummerBenefits.aspx'
    brands = getBrands()
    starter = True
    rules = [
        Rule(LinkExtractor(allow=('/anonymous/benefits',)), callback='parse',
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(r'/he-il/Benefits/(.+?)/Pages/(.+?)\.aspx',)),
             callback='parse', process_request=my_selenium_request_processor,
             follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(MaxSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        if self.starter:
            # Kick off the non-Selenium scrape of the anonymous benefits once.
            yield scrapy.Request(self.home, meta={'selenium': False},
                                 callback=self.anonymous_scrape)
            self.starter = False
        isValid = response.css('.benefitInfo_content').extract_first() is not None
        if isValid:
            if 'online.max.co.il' in response.url:
                description = cleanString(response.css("div.richHtml p").extract())
            else:
                description = cleanString(response.css(
                    "#ctl00_PlaceHolderMain_ctl00_divInitializeWrapperClass p"
                ).extract())
            title = cleanString(
                response.css("div.benefitInfo_content h2::text").extract_first())
            yield CouponsItem(
                Title=title,
                # The original expression also tested /BeyahadBishvilha/Pages/
                # but mapped it to '995' either way, so only /Biz/ matters.
                supplier=('100' if re.search(r'/Biz/Pages/', response.url)
                          else '995'),
                brand=filterBrands(
                    cleanString(response.css(
                        "div.benefitInfo_content h1::text").extract_first()),
                    self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)

    def anonymous_scrape(self, response):
        match = re.findall(
            r"https://online\.max\.co\.il/anonymous/benefits/(.+?)&Catnumber=(\d{1,10})",
            response.body.decode("utf-8"))
        for i in match:
            yield scrapy.Request(
                'https://online.max.co.il/anonymous/benefits/{}&Catnumber={}'.format(*i),
                callback=self.parse)
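
# MaxSpider.anonymous_scrape rebuilds benefit URLs from regex captures; a
# self-contained illustration with a fabricated page snippet:
def _extract_max_benefits(html):
    """
    >>> _extract_max_benefits('... https://online.max.co.il/anonymous/benefits/foo?x=1&Catnumber=123 ...')
    [('foo?x=1', '123')]
    """
    return re.findall(
        r"https://online\.max\.co\.il/anonymous/benefits/(.+?)&Catnumber=(\d{1,10})",
        html)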
class IstudentSpider(scrapy.Spider):
    name = 'istudent'
    undetectable = False
    wait = False
    allowed_domains = ['istudent.co.il']
    start_urls = ['http://istudent.co.il/']
    brands = getBrands()

    def __init__(self, *args, **kwargs):
        super(IstudentSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_brands,
                                 meta={'selenium': True})

    def parse_brands(self, response):
        result = [re.search(r"hotSaleByBrand\.php\?brandId=(.+)&name=(.+)", i).group(0)
                  for i in response.css("a.to-sales::attr(href)").extract()]
        for uri in result:
            yield scrapy.Request(urllib.parse.urljoin('https://istudent.co.il/', uri),
                                 callback=self.parse_products,
                                 meta={'selenium': True})

    def parse_products(self, response):
        # Capture group 1 holds everything after "&name="; equivalent to the
        # original group(0)[6:] slice.
        brandname = re.search(r"&name=(.+)", response.url).group(1)
        result = [re.search(r"saleInner\.php\?saleId=(.+)", i).group(0)
                  for i in response.css("a.to-sale::attr(href)").extract()
                  if re.search(r"saleInner\.php\?saleId=(.+)", i)]
        for uri in result:
            yield scrapy.Request(urllib.parse.urljoin('https://istudent.co.il/', uri),
                                 callback=self.parse_item,
                                 meta={'selenium': True, 'brandname': brandname})

    def parse_item(self, response):
        description = cleanString(response.css("div.desc").extract())
        title = cleanString(response.css("div.info-box h1.title").extract_first())
        yield CouponsItem(
            Title=title,
            supplier='1',
            brand=filterBrands(urllib.parse.unquote(response.meta["brandname"]),
                               self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
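
# A hedged sketch of driving several of these spiders in one process with a
# shared cycle id (each spider stores -a cycleid=... as self.cycleid and
# stamps it on items as `cyclerun`). CrawlerProcess and get_project_settings
# are standard Scrapy; the spiders chosen and the cycleid value here are
# illustrative only.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    for spider_cls in (DreamcardSpider, IstudentSpider):
        process.crawl(spider_cls, cycleid='20210801')
    process.start()  # blocks until all queued crawls finish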