示例#1
0
 def parse_page(self, driver, url):
     """Scrape the product page currently loaded in ``driver`` into a dict.

     This method does not navigate to ``url``; it only records it. Before
     reading any field it clicks the first header service-menu entry and
     then the last entry of the language list (per the original comment,
     this switches the site to Chinese).

     Args:
         driver: Browser driver holding the product page.
         url: URL of the product page, stored verbatim in the result.

     Returns:
         dict with keys ``url``, ``brand``, ``code``, ``price``, ``images``
         (``;``-joined image URLs without query string), ``detail``,
         ``title`` and ``unit``. ``code``, ``price`` and ``detail`` keep
         their defaults (``''``, ``0``, ``''``) when the page lacks them.

     Raises:
         Exception: If the 404 wrapper is present or the title is missing.
     """
     product = {'url': url,
                'brand': ProductDgSpider.brand,
                'code': '',
                'price': 0,
                'images': '',
                'detail': ''}
     # check 404
     if utils.find_element_by_css_selector(driver, 'div.b-error_page-wrapper'):
         raise Exception('404 page not found')
     # Switch to CHN: open the header service menu, then pick the last language.
     # NOTE(review): indexing fails if either selector matches nothing — confirm
     # what utils.find_elements_by_css_selector returns in that case.
     element = utils.find_elements_by_css_selector(driver, 'ul.l-header_service_menu > li > span')[0]
     driver.execute_script('arguments[0].click();', element)
     utils.sleep(1)
     element = utils.find_elements_by_css_selector(driver, 'ul.b-language_selector-language_list > li > a')[-1]
     driver.execute_script('arguments[0].click();', element)
     utils.sleep(1)
     # title
     element = utils.find_element_by_css_selector(driver, 'span.b-product_name')
     if element:
         product['title'] = element.text.strip()
     else:
         raise Exception('Title not found for %s' % driver.current_url)
     # code: the text after the first ':' of the master-id block
     element = utils.find_element_by_css_selector(driver, 'div.b-product_master_id')
     if element:
         parts = element.get_attribute('innerHTML').split(':')
         if len(parts) > 1:
             product['code'] = parts[1].strip()
     # unit
     product['unit'] = config.rmb
     # price
     element = utils.find_element_by_css_selector(driver, 'h4.b-product_price-standard')
     if element:
         # Drop the leading currency symbol and the thousands separators.
         price_text = element.text.strip()[1:].replace(',', '')
         try:
             product['price'] = float(price_text)
         except ValueError:
             pass  # A few products have no price; keep the default of 0.
     # images
     elements = utils.find_elements_by_css_selector(driver, 'div.js-thumbnails_slider > ul.js-thumbnails > li > img')
     images = [element.get_attribute('src').strip().split('?')[0] for element in elements]
     product['images'] = ';'.join(images)
     # detail
     element = utils.find_element_by_css_selector(driver, 'div.b-product_long_description')
     if element:
         text = element.get_attribute('innerHTML')
         text = text[1:]  # Remove the first '\n'
         text = text.replace('amp;', '')  # turns '&amp;' into '&'
         text = text.replace('。', '。\n')
         text = text.replace('<i>', '')
         text = text.replace('</i>', '')
         text = text.replace('<br>', '')
         text = text.replace('• ', '\n• ')
         product['detail'] = text[:-2]  # Drop the last two characters (trailing newline)
     return product
示例#2
0
 def parse_page(self, driver, url):
     """Collect product links from the listing page loaded in ``driver``.

     ``url`` is accepted for interface compatibility but not used here.

     Returns:
         The stripped ``href`` of every matching product anchor, joined
         with ``;`` (an empty string when nothing matches).
     """
     selector = 'div.productgridItem > ul.prod_style > li[style="display:block"]  > div.prod_grid > a'
     anchors = utils.find_elements_by_css_selector(driver, selector)
     return ';'.join(anchor.get_attribute('href').strip() for anchor in anchors)
 def parse_page(self, driver, url):
     """Load ``url`` in ``driver`` and scrape the product page into a dict.

     Args:
         driver: Browser driver used to open and query the page.
         url: URL of the product page; opened and stored in the result.

     Returns:
         dict with keys ``url``, ``brand``, ``code`` (always ``''`` here,
         no code is read from this page), ``price``, ``images``
         (``;``-joined), ``detail``, ``title`` and ``unit``.

     Raises:
         Exception: If the page shows the 404 message or has no title.
         ValueError: If the price text cannot be converted to a float.
     """
     driver.get(url)
     product = {
         'url': url,
         'brand': ProductStuartweitzmanSpider.brand,
         'code': '',
         'price': 0,
         'images': '',
         'detail': ''
     }
     # check 404
     element = utils.find_element_by_css_selector(
         driver, 'div.information_message.negative > p')
     if element and element.text.strip() == '404 页面未找到':
         raise Exception('404 page not found')
     # title
     element = utils.find_element_by_css_selector(driver,
                                                  'h1.pdname > span')
     if not element:
         raise Exception('Title not found for %s' % driver.current_url)
     product['title'] = element.text.strip()
     # code/NA
     # unit
     product['unit'] = config.rmb
     # price
     element = utils.find_element_by_css_selector(driver, 'p.big-price')
     if element:
         price_text = element.text.strip()
         # Discounted prices need special handling.
         if '|' in price_text:
             # Discounted: first token after the '|', minus its leading symbol.
             price_text = price_text.split('|')[1].strip().split(' ')[0][1:]
         else:
             # Regular: drop the leading symbol and the last two characters.
             price_text = price_text[1:-2].strip()
         product['price'] = float(price_text.replace(',', ''))
     # images
     elements = utils.find_elements_by_css_selector(
         driver, 'ul#carousel_alternate > li > span > a > img')
     images = []
     for element in elements:
         source = element.get_attribute('data-primaryimagesrc')
         # Presumably None when the attribute is absent (Selenium behaviour);
         # skip such thumbnails instead of failing the whole page.
         if source is not None:
             images.append(source.strip())
     product['images'] = ';'.join(images)
     # detail
     element = utils.find_element_by_css_selector(driver,
                                                  'div.pdp-description')
     if element:
         product['detail'] = element.text.strip()
     return product
示例#4
0
 def parse_page(self, driver, url):
     """Scroll the listing page until no new tiles appear, then collect links.

     The page already loaded in ``driver`` is scrolled by its full height,
     with a one-second pause, for as long as each re-query finds more
     product links than the previous one. ``url`` is not used here.

     Returns:
         The stripped ``href`` of every product link found by the final
         query, joined with ``;``.
     """
     selector = 'div.l-product_tiles > div.js-product_tile > div > a.js-producttile_link'
     seen = 0
     links = utils.find_elements_by_css_selector(driver, selector)
     # Keep scrolling while the previous scroll revealed additional tiles.
     while len(links) > seen:
         seen = len(links)
         driver.execute_script('window.scrollBy(0, document.body.scrollHeight);')
         utils.sleep(1)
         links = utils.find_elements_by_css_selector(driver, selector)
     return ';'.join([link.get_attribute('href').strip() for link in links])
示例#5
0
 def parse_page(self, driver, url):
     """Scrape the product page currently loaded in ``driver`` into a dict.

     This method does not navigate to ``url``; it only records it.

     Args:
         driver: Browser driver holding the product page.
         url: URL of the product page, stored verbatim in the result.

     Returns:
         dict with keys ``url``, ``brand``, ``code``, ``price``, ``images``
         (``;``-joined), ``detail`` (raw inner HTML), ``title`` and
         ``unit``. Optional fields keep their defaults when absent.

     Raises:
         Exception: If the "not found" block is present or the title is
             missing.
     """
     product = {
         'url': url,
         'brand': ProductFerragamoSpider.brand,
         'code': '',
         'price': 0,
         'images': '',
         'detail': ''
     }
     # check 404
     if utils.find_element_by_css_selector(driver, 'div.nofound'):
         raise Exception('404 page not found')
     # title
     element = utils.find_element_by_css_selector(
         driver, 'div.dpd-main__details__head > div > h1.dpd-main__name')
     if not element:
         raise Exception('Title not found for %s' % driver.current_url)
     product['title'] = element.text.strip()
     # code: everything after the first space-separated token
     element = utils.find_element_by_css_selector(
         driver, 'div.dpd-main__details__head > div > div.dpd-main__sku')
     if element:
         product['code'] = ' '.join(element.text.strip().split(' ')[1:])
     # unit
     product['unit'] = config.rmb
     # price: second space-separated token of the price text
     element = utils.find_element_by_css_selector(
         driver, 'div.dpd-main__details__head > div > div.dpd-main__price')
     if element:
         amount = element.text.strip().split(' ')[1]
         # float() rejects thousands separators such as "1,234".
         product['price'] = float(amount.replace(',', ''))
     # images
     elements = utils.find_elements_by_css_selector(
         driver, 'div.dpd-visuals > div > a > img')
     images = [element.get_attribute('src').strip() for element in elements]
     product['images'] = ';'.join(images)
     # detail
     element = utils.find_element_by_css_selector(driver,
                                                  'div.dpd-info__body')
     if element:
         product['detail'] = element.get_attribute('innerHTML').strip()
     return product
示例#6
0
 def parse_page(self, driver, url):
     """Expand the listing, scroll until it stops growing, and collect links.

     If the "view all products" control is present it is clicked first.
     The page is then scrolled by its full height, with a one-second pause,
     for as long as each re-query finds more product links than the
     previous one. ``url`` is not used here.

     Returns:
         The stripped ``href`` of every product link found by the final
         query, joined with ``;``.
     """
     # Click to expand pagination, when the control exists.
     view_all = utils.find_element_by_css_selector(
         driver, 'li.view-all-products > span')
     if view_all:
         driver.execute_script('arguments[0].click();', view_all)
     # Scroll down to load more tiles.
     selector = 'div.product-tile > figure > a.thumb-link'
     seen = 0
     links = utils.find_elements_by_css_selector(driver, selector)
     while len(links) > seen:
         seen = len(links)
         driver.execute_script(
             'window.scrollBy(0, document.body.scrollHeight);')
         utils.sleep(1)
         links = utils.find_elements_by_css_selector(driver, selector)
     return ';'.join([link.get_attribute('href').strip() for link in links])
示例#7
0
 def parse_page(self, driver, url):
     """Scrape the product page currently loaded in ``driver`` into a dict.

     This method does not navigate to ``url``; it only records it. It first
     opens the locale selector and clicks the CN entry labelled
     'Simplified Chinese', then reads the product fields.

     Args:
         driver: Browser driver holding the product page.
         url: URL of the product page, stored verbatim in the result.

     Returns:
         dict with keys ``url``, ``brand``, ``code``, ``price``, ``images``
         (``;``-joined), ``detail`` (one line per detail item, HTML tags
         removed), ``title`` and ``unit``. Optional fields keep their
         defaults when absent.

     Raises:
         Exception: If the locale selector or its Chinese entry is missing,
             the 404 block is present, or the title is missing.
     """
     product = {'url': url,
                'brand': ProductLoeweSpider.brand,
                'code': '',
                'price': 0,
                'images': '',
                'detail': ''}
     # Switch language
     element = utils.find_element_by_css_selector(driver, 'div.siteSelectors-current.siteSelectors-current-locale')
     if not element:
         raise Exception("Cant't select language")
     driver.execute_script('arguments[0].click();', element)
     utils.sleep(1)
     elements = utils.find_elements_by_css_selector(driver, 'ul.siteSelectors-list-locale[data-country=CN] > li > a')
     for element in elements:
         if element.get_attribute('innerHTML').strip() == 'Simplified Chinese':
             driver.execute_script('arguments[0].click();', element)
             utils.sleep(1)
             break
     else:
         # Loop finished without finding the Chinese entry.
         raise Exception("Can't switch to Chinese")
     # check 404
     if utils.find_element_by_css_selector(driver, 'div.error404'):
         raise Exception('404 page not found')
     # title
     element = utils.find_element_by_css_selector(driver, 'h1.product-name')
     if not element:
         raise Exception('Title not found for %s' % driver.current_url)
     product['title'] = element.text.strip()
     # code: the text after the first ': ' separator, when there is one
     element = utils.find_element_by_css_selector(driver, 'span.model-id')
     if element:
         parts = element.text.strip().split(': ')
         if len(parts) > 1:
             product['code'] = parts[1]
     # unit
     product['unit'] = config.rmb
     # price
     element = utils.find_element_by_css_selector(driver, 'div.price-and-size-wrapper > div.product-price > span.price-sales')
     if element:
         price_text = element.text.strip()
         # The page shows this literal instead of a number when no price applies.
         if price_text != '不适用':
             product['price'] = float(price_text[1:].replace(',', ''))
     # images
     elements = utils.find_elements_by_css_selector(driver, 'ul.product-thumbnails-list > li > a')
     images = [element.get_attribute('href').strip() for element in elements]
     product['images'] = ';'.join(images)
     # detail: both detail columns, first column first
     tag_pattern = re.compile(r'<[^>]+>', re.S)
     details = []
     for selector in ('ul.details-col-1 > li', 'ul.details-col-2 > li'):
         for element in utils.find_elements_by_css_selector(driver, selector):
             text = element.get_attribute('innerHTML').strip()
             text = tag_pattern.sub('', text)  # strip HTML tags
             details.append(text.replace('\n\t\t', ''))
     product['detail'] = '\n'.join(details)
     return product