# Scrapy coupon spiders. Shared stdlib / third-party imports are declared
# here; the project-local names used throughout (CouponsItem, getBrands,
# cleanString, filterBrands, similar, convertToNumber, allowed_shipping_list,
# the *Handler / process_* link callbacks and my_selenium_request_processor)
# are defined elsewhere in this repo, so their import paths are not
# reproduced here.
import json
import re
import urllib.parse
from datetime import datetime

import requests
import scrapy
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class UniqclubSpider(scrapy.Spider):
    name = 'uniqClub'
    undetectable = False
    wait = False
    allowed_domains = ['uniq-club.co.il']
    start_urls = ['http://uniq-club.co.il/']
    apiBase = 'https://www.uniq-club.co.il/discounts/ajax/{}'
    linkBase = 'https://www.uniq-club.co.il/discounts#{}'
    brands = getBrands()

    def __init__(self, *args, **kwargs):
        super(UniqclubSpider, self).__init__(*args, **kwargs)
        # Crawl-cycle id, passed on the command line as -a cycleid=...
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        # Probe the discounts AJAX endpoint for every id in 0..999.
        for d in range(0, 1000):
            uri = self.apiBase.format(d)
            yield scrapy.Request(uri, callback=self.parse_coupon,
                                 method='POST', meta={'d_number': d})

    def parse_coupon(self, response):
        # A cafe_logo icon marks a real discount page.
        isValid = response.css('i.cafe_logo').extract_first() is not None
        if isValid:
            description = cleanString(response.css('span.small_text').extract())
            title = cleanString(response.css('div.richtext_div').extract_first())
            yield CouponsItem(
                Title=title,
                supplier='990',
                brand=filterBrands(
                    cleanString(response.css('div.d_section > h1::text').get())[:-3],
                    self.brands),
                JoinUrl=self.linkBase.format(response.meta['d_number']),
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)
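
# UniqclubSpider enumerates discount ids blindly and ties each AJAX probe
# back to a public page via a fragment URL. The helper below only illustrates
# that endpoint/link pairing for one arbitrary id; nothing in the spider
# depends on it.
def _uniq_urls_example(d):
    """
    >>> _uniq_urls_example(42)
    ('https://www.uniq-club.co.il/discounts/ajax/42', 'https://www.uniq-club.co.il/discounts#42')
    """
    return ('https://www.uniq-club.co.il/discounts/ajax/{}'.format(d),
            'https://www.uniq-club.co.il/discounts#{}'.format(d))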
class IsracardSpider(CrawlSpider):
    name = 'isracard'
    undetectable = False
    wait = False
    allowed_domains = ['benefits.isracard.co.il']
    start_urls = ['https://benefits.isracard.co.il/']
    brands = getBrands()
    # Note: Scrapy's docs advise against using 'parse' as a rule callback in a
    # CrawlSpider, since CrawlSpider implements its own parse(); kept as-is
    # because the whole repo follows this pattern.
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=itemHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=mainHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(IsracardSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        isValid = response.css('.benefit-details-txt').extract_first() is not None
        if isValid:
            description = cleanString(response.css("div.benefit-details-txt").extract())
            title = cleanString(response.css("div.benefit-info h1::text").extract_first())
            yield CouponsItem(
                Title=title,
                supplier='996',
                # The brand is derived from the same <h1> text as the title.
                brand=filterBrands(title, self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)
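
# The Isracard rules above hand project callbacks (itemHandler,
# categoryHandler, mainHandler) to LinkExtractor as process_value. Their real
# implementations live elsewhere in this repo; the function below is only a
# hedged sketch of the contract such a callback fulfils per Scrapy's API:
# receive the raw attribute value (here an onclick) and return a crawlable
# URL, or None to drop the link.
def _example_process_value(value):
    """Hypothetical onclick handler: extract the first quoted URL, if any."""
    m = re.search(r"['\"](https?://[^'\"]+)['\"]", value or '')
    return m.group(1) if m else None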
class DreamcardSpider(scrapy.Spider):
    name = 'dreamcard'
    undetectable = False
    wait = False
    allowed_domains = ['dreamcard.co.il']
    start_urls = ['https://www.dreamcard.co.il/special-offers']
    brands = getBrands()

    def __init__(self, *args, **kwargs):
        super(DreamcardSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        lst = response.css("nav.content ul.center li a").extract()
        for a in lst:
            soup = BeautifulSoup(a, 'lxml')
            anchor_tag = soup.find("a")
            href = anchor_tag.get('href')
            title = anchor_tag.get('title')
            img_tag = soup.find("img")
            desc = img_tag.get('src')
            # Only card links pointing back into /special-offers/ are offers.
            if re.search(r'https://www\.dreamcard\.co\.il/special-offers/', href):
                yield CouponsItem(
                    Title=cleanString(title),
                    supplier='101',
                    brand=filterBrands(title, self.brands),
                    JoinUrl=href + "&&" + str(convertToNumber(title)),
                    Description=desc,
                    ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                    DoorToDoorShipping=any(ext in title
                                           for ext in allowed_shipping_list),
                    cyclerun=self.cycleid)
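
# A quick, self-contained check of the DreamcardSpider URL filter above; the
# sample href in the doctest is made up for illustration.
def _dreamcard_href_matches(href):
    """Return True when an <a> href points at a special-offers page.

    >>> _dreamcard_href_matches('https://www.dreamcard.co.il/special-offers/example-offer')
    True
    """
    return re.search(r'https://www\.dreamcard\.co\.il/special-offers/', href) is not None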
class MyoferSpider(scrapy.Spider):
    name = 'myofer'
    undetectable = False
    wait = False
    allowed_domains = ['myofer.co.il']
    start_urls = ['http://myofer.co.il/']
    token = ''
    brands = getBrands()
    BASE_URL = 'https://api-mobile.myofer.co.il/v2/sales'

    def __init__(self, *args, **kwargs):
        super(MyoferSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_malls,
                                 meta={'selenium': True, 'myoferToken': True})

    def parse_malls(self, response):
        # The Selenium middleware exposes the browser cookies in
        # meta['cookieJar']; the 'token' cookie authorizes the mobile API.
        self.token = next(item for item in response.meta["cookieJar"]
                          if item["name"] == "token").get('value')
        result = [re.search(r"/(.+)/category/all-benefits", i).group(0)
                  for i in response.css("a.mall-name-expanded::attr(href)").extract()
                  if re.search(r"/(.+)/category/all-benefits", i)]
        for uri in result:
            yield scrapy.Request(f'https://myofer.co.il{uri}',
                                 callback=self.parse_mallId)

    def parse_mallId(self, response):
        # Was {'Accept': 'Accept: application/json'}; the header value should
        # not repeat the header name.
        headers = {'Accept': 'application/json',
                   'Authorization': f'Bearer {self.token}'}
        idString = re.search(r"mallId=\d+&", str(response.body))
        if idString:
            d = re.search(r"\d+", idString.group(0))
            if d:
                path = re.search(r"\.il/(.+)/c", str(response.url))
                mallname = (re.search(r"/(.+)/", path.group(0)).group(0)
                            if path else None)
                if mallname:
                    params = {'mallId': d.group(0), 'limit': 100000}
                    url = f'{self.BASE_URL}?{urllib.parse.urlencode(params)}'
                    yield scrapy.Request(url, callback=self.parse,
                                         headers=headers,
                                         meta={'mallName': mallname})

    def parse(self, response):
        result = json.loads(response.body)
        n = result.get('meta').get('totalitems')
        if n > 0:
            for r in result.get('data'):
                for sale in r.get('attributes').get('sales'):
                    description = cleanString(sale['description'])
                    brandName = filterBrands(cleanString(sale['brand']['title']),
                                             self.brands)
                    title = f"{brandName} {cleanString(sale['title'])}"
                    yield CouponsItem(
                        Title=title,
                        supplier='2',
                        brand=brandName,
                        JoinUrl=(f"https://myofer.co.il{response.meta['mallName']}"
                                 f"brands/{sale['brand']['seoname']}/{sale['id']}"),
                        Description=description,
                        ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                        DoorToDoorShipping=any(ext in (description + title)
                                               for ext in allowed_shipping_list),
                        cyclerun=self.cycleid)
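
# MyoferSpider builds its API call with urllib.parse.urlencode; the sketch
# below shows the resulting request URL for a made-up mall id.
def _example_sales_url(mall_id):
    """
    >>> _example_sales_url('17')
    'https://api-mobile.myofer.co.il/v2/sales?mallId=17&limit=100000'
    """
    query = urllib.parse.urlencode({'mallId': mall_id, 'limit': 100000})
    return f'https://api-mobile.myofer.co.il/v2/sales?{query}'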
class DinersSpider(CrawlSpider):
    name = 'diners'
    undetectable = True
    wait = True
    elementId = 'cal-shop-brand'
    allowed_domains = ['diners-store.co.il']
    start_urls = ['https://www.diners-store.co.il/']
    brands = getBrands()
    integrator = '-כותרת משנה'  # Hebrew for "-subtitle"; glues the two title boxes
    rules = [
        Rule(LinkExtractor(allow=(), process_value=itemHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(DinersSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(response.css("div#full-description-text").extract())
        if not description:
            description = cleanString(response.css("div.banner-club-big-text-box").extract())
        greenbox = (cleanString(response.css("h1.productTitle").extract())
                    + cleanString(response.css("div.productSubTitle").extract()))
        big_redbox = cleanString(response.css("td.product-list-checkboxes").extract())
        if similar(greenbox, big_redbox) > 0.9:
            # The two boxes carry near-identical text; keep one copy.
            title = greenbox
        else:
            price_match = re.search(r"'PriceDiscount':\s'\d{1,}'", str(response.body))
            low_price = price_match.group(0) if price_match else ''
            title = (greenbox + self.integrator + big_redbox
                     + low_price.replace("'PriceDiscount':", ''))
        yield CouponsItem(
            Title=title,
            supplier='16',
            brand=filterBrands(cleanString(response.css("h1.productTitle").extract()),
                               self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
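
# `similar` (used by DinersSpider above) is a project helper whose source is
# not shown here. A plausible stand-in returning a 0..1 ratio, assuming it
# behaves like difflib's SequenceMatcher:
def _similar_sketch(a, b):
    """Hypothetical equivalent of the repo's similar() helper.

    >>> _similar_sketch('abc', 'abc')
    1.0
    """
    from difflib import SequenceMatcher
    return SequenceMatcher(None, a, b).ratio()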
class OgenSpider(CrawlSpider):
    name = 'ogen'
    undetectable = False
    wait = False
    allowed_domains = ['ogen.org.il']
    start_urls = ['https://ogen.org.il/']
    brands = getBrands()
    # Retired AJAX entry point, kept for reference:
    # ajax_url = 'https://ogen.org.il/wp-admin/admin-ajax.php'
    # payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"action\"\r\n\r\nmatat_filter\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"data\"\r\n\r\nminPrice=0&maxPrice=1000\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"security\"\r\n\r\ncb03a93ccd\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    # headers = {
    #     'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
    #     'cache-control': "no-cache",
    #     'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    # }
    rules = [
        Rule(LinkExtractor(allow=('/product-category/',)), follow=True),
        Rule(LinkExtractor(allow=('/product/',)), callback='parse'),
    ]

    # def start_requests(self):
    #     yield scrapy.Request(self.start_urls[0], callback=self.ajax_parse)

    # def ajax_parse(self, response):
    #     result = requests.request("POST", self.ajax_url, data=self.payload, headers=self.headers)
    #     response = HtmlResponse(self.ajax_url, body=result.text, encoding='utf-8')
    #     products = [i for i in response.css("a::attr(href)").extract() if re.search(r"/product/", i)]
    #     for links in products:
    #         yield scrapy.Request(links, callback=self.parse)
    #     return super(OgenSpider, self).start_requests()

    def __init__(self, *args, **kwargs):
        super(OgenSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(response.css("div.short-info p::text").getall())
        title = (cleanString(response.css("h2.product-name::text").get())
                 + cleanString(response.css("div.price").extract()))
        yield CouponsItem(
            Title=title,
            supplier='992',
            brand=filterBrands(cleanString(response.css("h2.product-name::text").get()),
                               self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
class PaisplusSpider(CrawlSpider):
    name = 'pais'
    undetectable = True
    wait = True
    elementId = 'accesability_container'
    allowed_domains = ['paisplus.co.il']
    apiBase = 'https://data.dolcemaster.co.il'
    start_urls = ['https://paisplus.co.il/']
    siteUuid = 'BBAD629F-E549-4612-9EAE-3AA9E85F1C33'
    linkBase = 'https://www.paisplus.co.il/benefits/'
    getBenefitDetails = urllib.parse.urljoin(apiBase, 'api/v5_1/public/benefits_details')
    # Was {'Accept': 'Accept: application/json'}; the header value should not
    # repeat the header name.
    headers = {'Accept': 'application/json'}
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=('/category/',)),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=('/benefits/',)), callback='parse',
             process_request=my_selenium_request_processor, follow=False),
    ]

    def __init__(self, *args, **kwargs):
        super(PaisplusSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        m = re.search(r'https://www\.paisplus\.co\.il/benefits/(.+)/', response.url)
        if m:
            benefit_id = m.group(1)
            formdata = {'club_id': self.siteUuid, 'benefits_id': benefit_id}
            r = requests.post(self.getBenefitDetails, json=formdata)
            if r.status_code == 200:
                data = r.json().get('benefits')[0]
                description = cleanString(data['benefits_description'])
                title = cleanString(data['benefits_name'])
                yield CouponsItem(
                    Title=title,
                    supplier='991',
                    brand=filterBrands(description, self.brands),
                    JoinUrl=self.linkBase + data['benefits_id'],
                    Description=description,
                    ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                    DoorToDoorShipping=any(ext in (description + title)
                                           for ext in allowed_shipping_list),
                    cyclerun=self.cycleid)
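
# PaisplusSpider fetches benefit details with a plain requests.post; the
# endpoint and JSON keys below come straight from the spider above, while the
# benefit id argument is a placeholder supplied by the caller.
def _fetch_benefit_details(benefit_id,
                           club_id='BBAD629F-E549-4612-9EAE-3AA9E85F1C33'):
    """Return the parsed benefits list, or None on a non-200 response."""
    r = requests.post(
        'https://data.dolcemaster.co.il/api/v5_1/public/benefits_details',
        json={'club_id': club_id, 'benefits_id': benefit_id})
    return r.json().get('benefits') if r.status_code == 200 else None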
class HvrSpider(CrawlSpider):
    name = 'hvr'
    undetectable = True
    elementId = 'wrap'
    wait = True
    allowed_domains = ['hvr.co.il']
    start_urls = []
    signin_url = 'https://hvr.co.il/signin.aspx'
    usrEId, username = '******', '052046133'
    pwdEId, password = '******', '5167722'
    brands = getBrands()
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/84.0.4147.135 Safari/537.36'}
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('data-item_id',), process_value=process_item_id),
             callback='parse_item', follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_item_href),
             callback='parse_item', follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_lst),
             callback='parse_lst', follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_cat),
             follow=True),
    ]

    def start_requests(self):
        yield scrapy.Request(self.signin_url, callback=self.after_login,
                             meta={"selenium": True, "login": True,
                                   "elementId": "tz"})

    def after_login(self, response):
        # Collect the *.json menu descriptors the page references, fetch each
        # one through the site's /ajax/ endpoint, and seed start_urls with the
        # pages they point at.
        result = [re.search(r"(.+?)\.json", i).group(0)
                  for i in response.css("div::attr(title)").extract()
                  if re.search(r"(.+?)\.json", i)]
        result2 = [re.search(r"(.+?)\.json", i).group(0)
                   for i in response.css("div::attr(data-json)").extract()
                   if re.search(r"(.+?)\.json", i)]
        for uri in result + result2:
            # Drop anything up to an escaped backslash left by the markup.
            stripped = re.search(r"(?<=\\).*", uri)
            uri = stripped.group(0) if stripped else uri
            r = requests.get('https://www.hvr.co.il/ajax/' + uri, headers=self.headers)
            m = (re.findall(r"(page|url)':\s?'(.+?)'", str(r.json()))
                 if r.status_code == 200 else None)
            if m:
                for t, s in m:
                    n = re.search(r"(?=home_page\.aspx).*", s)
                    if t == 'page':
                        url = 'https://www.hvr.co.il/home_page.aspx?page=' + s
                    elif t == 'url' and n:
                        url = 'https://www.hvr.co.il/' + n.group(0)
                    else:
                        # The original fell through here and re-appended the
                        # stale url (or raised NameError on the first pass).
                        continue
                    self.start_urls.append(url)
        self.start_urls.append(response.url)
        return super(HvrSpider, self).start_requests()

    def parse_lst(self, response):
        template_links = re.findall(r'template_link:\s?"(.+)\d{5,8}"', str(response.body))
        for uri in template_links:
            yield scrapy.Request(urllib.parse.urljoin('https://www.hvr.co.il/', uri),
                                 callback=self.parse_item)

    def parse_item(self, response):
        # Placeholder: item pages are only logged, not yielded, for this spider.
        print(response.url)

    parse_start_url = parse_item
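
# The lookbehind regex used in HvrSpider.after_login trims everything up to
# the first backslash in a descriptor path; a self-contained illustration
# with invented file names:
def _strip_escaped_prefix(uri):
    r"""Drop everything up to the first backslash, as HvrSpider does.

    >>> _strip_escaped_prefix('menu\\items.json')
    'items.json'
    >>> _strip_escaped_prefix('items.json')
    'items.json'
    """
    m = re.search(r"(?<=\\).*", uri)
    return m.group(0) if m else uri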
class TovSpider(CrawlSpider):
    name = 'tov'
    undetectable = True
    wait = False
    allowed_domains = ['tov.org.il']
    start_urls = []
    signin_url = 'https://www.tov.org.il/signin.aspx'
    usrEId, username = '******', '025190273'
    pwdEId, password = '******', '2535271'
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('data-href', 'href', 'onclick'),
                           process_value=itemHandler),
             callback='parse_item', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('data-href', 'href', 'onclick'),
                           process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def start_requests(self):
        yield scrapy.Request(self.signin_url, callback=self.after_login,
                             meta={"selenium": True, "login": True})

    def after_login(self, response):
        self.start_urls.append(response.url)
        return super(TovSpider, self).start_requests()

    def parse_item(self, response):
        # Placeholder: crawled item URLs are appended to a local log file.
        with open('log.txt', 'a') as f:
            f.write(response.url + '\n')

    parse_start_url = parse_item
class MegaleanSpider(CrawlSpider):
    name = 'megalean'
    undetectable = False
    wait = False
    allowed_domains = ['megalean.co.il']
    start_urls = ['https://www.megalean.co.il/site/pg/home']
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), attrs=('onclick',),
                           process_value=process_item),
             callback='parse', follow=True),
        Rule(LinkExtractor(allow=(), tags=('div',), attrs=('data-href',),
                           process_value=process_divLinks),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=('/search/', '/cat_')),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(MegaleanSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(
            response.css('ul.product-info li:nth-child(6)').extract())
        title = cleanString(response.css("#ptitle::text").get())
        yield CouponsItem(
            Title=title,
            supplier='994',
            # The brand is derived from the same #ptitle text as the title.
            brand=filterBrands(title, self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
class BehatsdaSpider(CrawlSpider):
    name = 'behatsda'
    undetectable = True
    wait = True
    elementId = 'aspnetForm'
    allowed_domains = ['behatsdaa.org.il']
    login_url = 'https://behatsdaa.org.il/'
    start_urls = ['https://www.behatsdaa.org.il/HomePage.aspx']
    usrEId, username = '******', '0000'
    pwdEId, password = '******', '0000'
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), tags=('a', 'area', 'button', 'div'),
                           attrs=('onclick', 'href'),
                           process_value=process_dealsClick),
             follow=False, callback='parse_item'),
        Rule(LinkExtractor(allow=(), tags=('a', 'area', 'button', 'div'),
                           attrs=('onclick', 'href'),
                           process_value=process_catNumberOnclick),
             follow=False, callback='parse_item'),
        Rule(LinkExtractor(allow=(), tags=('a', 'area', 'button', 'div'),
                           attrs=('onclick', 'href'),
                           process_value=process_catOrderOnclick),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(r"deals\.php\?filter",)),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(BehatsdaSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        yield scrapy.Request(self.login_url, callback=self.after_login,
                             meta={"selenium": True, "login": True,
                                   "elementId": "TextBoxPersonalNumber"})

    def after_login(self, response):
        return super(BehatsdaSpider, self).start_requests()

    def parse_item(self, response):
        isValid = response.css(
            '#ctl00_ContentPlaceHolder2_LabelCategoryDescription'
        ).extract_first() is not None
        if isValid:
            title = response.css(
                '#ctl00_ContentPlaceHolder2_LabelCategoryName::text').get()
            description = response.css(
                '#ctl00_ContentPlaceHolder2_LabelCategoryDescription::text'
            ).get() + cleanString(response.css(
                '#ctl00_ContentPlaceHolder2_LabelCategoryText').extract())
            yield CouponsItem(
                Title=title,
                supplier='998',
                brand=filterBrands(cleanString(title), self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)

    parse_start_url = parse_item
class ImgSpider(CrawlSpider):
    name = 'teachersUnion'
    undetectable = False
    wait = False
    allowed_domains = ['igm.org.il']
    encoding = 'utf-8'
    start_urls = ['https://www.igm.org.il/home_page.aspx?page=megalean_home/']
    base_url = "https://www.igm.org.il/"
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_item),
             callback="parse_item"),
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_couponOnclick),
             callback="parse_item"),
        Rule(LinkExtractor(allow=("search",), attrs=('onclick', 'href'),
                           tags=("div", "a", "area"),
                           process_value=process_searchOnclick),
             follow=True),
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_couponFollow),
             follow=True),
        Rule(LinkExtractor(allow=(), attrs=('onclick', 'href'),
                           process_value=process_nextPageOnclick),
             follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(ImgSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        yield scrapy.Request('https://www.igm.org.il/home_page.aspx?page=igm_PNBS',
                             callback=self.get_json, meta={'selenium': True})

    def get_json(self, response):
        result = [re.search(r"(.+?)\.json", i).group(0)
                  for i in response.css("div.sidebarObject::attr(title)").extract()]
        for uri in result:
            yield scrapy.Request('https://www.igm.org.il/ajax/' + uri,
                                 callback=self.parse_json)

    def parse_json(self, response):
        jsonStr = str(response.body, self.encoding)
        m = re.findall(r'"url":"(.+?)\.aspx\?(.+?)"', jsonStr)
        for uri in m:
            self.start_urls.append(
                urllib.parse.urljoin(self.base_url, '.aspx?'.join(uri)))
        return super(ImgSpider, self).start_requests()

    def parse_item(self, response):
        isValid = response.css(
            'div.text_more_info_white.benefits').extract_first() is not None
        validCoupon = cleanString(
            response.css("h2.page-title::text").extract_first())
        if isValid and validCoupon and (validCoupon != ""):
            description = cleanString(response.css("div#main").extract())
            # The page title doubles as both the item title and brand source.
            title = validCoupon
            yield CouponsItem(
                Title=title,
                supplier='997',
                brand=filterBrands(title, self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)

    parse_start_url = parse_item
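
# ImgSpider.parse_json captures each URL as a (path, query) pair split around
# '.aspx?' and rejoins it when queueing; a compact illustration with an
# invented pair:
def _rejoin_aspx(parts):
    """
    >>> _rejoin_aspx(('home_page', 'page=deals'))
    'https://www.igm.org.il/home_page.aspx?page=deals'
    """
    return urllib.parse.urljoin('https://www.igm.org.il/', '.aspx?'.join(parts))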
class MaxSpider(CrawlSpider):
    name = 'max'
    undetectable = False
    wait = False
    allowed_domains = ['max.co.il']
    start_urls = ['https://www.max.co.il/he-il/Benefits/Pages/SummerBenefits.aspx']
    home = 'https://www.max.co.il/he-il/Benefits/Pages/SummerBenefits.aspx'
    brands = getBrands()
    starter = True
    rules = [
        Rule(LinkExtractor(allow=('/anonymous/benefits',)), callback='parse',
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(r'/he-il/Benefits/(.+?)/Pages/(.+?)\.aspx',)),
             callback='parse', process_request=my_selenium_request_processor,
             follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(MaxSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        if self.starter:
            # Kick off the non-Selenium scrape of the anonymous benefits once.
            yield scrapy.Request(self.home, meta={'selenium': False},
                                 callback=self.anonymous_scrape)
            self.starter = False
        isValid = response.css('.benefitInfo_content').extract_first() is not None
        if isValid:
            if 'online.max.co.il' in response.url:
                description = cleanString(response.css("div.richHtml p").extract())
            else:
                description = cleanString(response.css(
                    "#ctl00_PlaceHolderMain_ctl00_divInitializeWrapperClass p"
                ).extract())
            title = cleanString(
                response.css("div.benefitInfo_content h2::text").extract_first())
            yield CouponsItem(
                Title=title,
                # The original expression also tested /BeyahadBishvilha/Pages/
                # but mapped it to '995' either way, so only /Biz/ matters.
                supplier=('100' if re.search(r'/Biz/Pages/', response.url)
                          else '995'),
                brand=filterBrands(
                    cleanString(response.css(
                        "div.benefitInfo_content h1::text").extract_first()),
                    self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid)

    def anonymous_scrape(self, response):
        match = re.findall(
            r"https://online\.max\.co\.il/anonymous/benefits/(.+?)&Catnumber=(\d{1,10})",
            response.body.decode("utf-8"))
        for i in match:
            yield scrapy.Request(
                'https://online.max.co.il/anonymous/benefits/{}&Catnumber={}'.format(*i),
                callback=self.parse)
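
# MaxSpider.anonymous_scrape rebuilds benefit URLs from regex captures; a
# self-contained illustration with a fabricated page snippet:
def _extract_max_benefits(html):
    """
    >>> _extract_max_benefits('... https://online.max.co.il/anonymous/benefits/foo?x=1&Catnumber=123 ...')
    [('foo?x=1', '123')]
    """
    return re.findall(
        r"https://online\.max\.co\.il/anonymous/benefits/(.+?)&Catnumber=(\d{1,10})",
        html)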
class IstudentSpider(scrapy.Spider):
    name = 'istudent'
    undetectable = False
    wait = False
    allowed_domains = ['istudent.co.il']
    start_urls = ['http://istudent.co.il/']
    brands = getBrands()

    def __init__(self, *args, **kwargs):
        super(IstudentSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_brands,
                                 meta={'selenium': True})

    def parse_brands(self, response):
        result = [re.search(r"hotSaleByBrand\.php\?brandId=(.+)&name=(.+)", i).group(0)
                  for i in response.css("a.to-sales::attr(href)").extract()]
        for uri in result:
            yield scrapy.Request(urllib.parse.urljoin('https://istudent.co.il/', uri),
                                 callback=self.parse_products,
                                 meta={'selenium': True})

    def parse_products(self, response):
        # Capture group 1 holds everything after "&name="; equivalent to the
        # original group(0)[6:] slice.
        brandname = re.search(r"&name=(.+)", response.url).group(1)
        result = [re.search(r"saleInner\.php\?saleId=(.+)", i).group(0)
                  for i in response.css("a.to-sale::attr(href)").extract()
                  if re.search(r"saleInner\.php\?saleId=(.+)", i)]
        for uri in result:
            yield scrapy.Request(urllib.parse.urljoin('https://istudent.co.il/', uri),
                                 callback=self.parse_item,
                                 meta={'selenium': True, 'brandname': brandname})

    def parse_item(self, response):
        description = cleanString(response.css("div.desc").extract())
        title = cleanString(response.css("div.info-box h1.title").extract_first())
        yield CouponsItem(
            Title=title,
            supplier='1',
            brand=filterBrands(urllib.parse.unquote(response.meta["brandname"]),
                               self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
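
# A hedged sketch of driving several of these spiders in one process with a
# shared cycle id (each spider stores -a cycleid=... as self.cycleid and
# stamps it on items as `cyclerun`). CrawlerProcess and get_project_settings
# are standard Scrapy; the spiders chosen and the cycleid value here are
# illustrative only.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    for spider_cls in (DreamcardSpider, IstudentSpider):
        process.crawl(spider_cls, cycleid='20210801')
    process.start()  # blocks until all queued crawls finish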