Exemplo n.º 1
0
 def __init__(self, lang):
     """Store ``lang`` on the instance and create a ``Downloader``.

     @param lang: kept as ``self.lang``; presumably a language code
                  used by the rest of the class -- confirm with callers.
     """
     self.lang = lang
     # Each instance gets its own Downloader object.
     self.downloader = Downloader()
Exemplo n.º 2
0
 def __init__(self, page):
     """Create a ``Downloader`` and parse ``page`` into ``self.soup``.

     @param page: markup handed straight to ``BeautifulSoup``.
     """
     self.downloader = Downloader()
     # NOTE(review): no parser is named here, so BeautifulSoup picks one
     # itself; the result may differ between environments.
     self.soup = BeautifulSoup(page)
Exemplo n.º 3
0
class ZaraScrape(Scraper):
    """Scraper that walks the Zara website menus and collects products.

    setConfig() must be called before run(): run() reads the section,
    subsection, product type and body part stored by setConfig().
    """

    BRAND_NAME = 'Zara'
    PAGE_BASE = 'http://www.zara.com/fr/'

    def __init__(self, lang):
        """Store the language and create the Downloader used for fetches.

        @param lang: string appended to PAGE_BASE to build the home page
                     URL, and used as a path component for images.
        """
        self.lang = lang
        self.downloader = Downloader()

    def setConfig(self, section, subsection, productType, bodyPart):
        """Remember what to scrape.

        @param section: name of the first-level menu entry to open
        @param subsection: name of the second-level menu entry to open
        @param productType: passed unchanged to every Product built by run()
        @param bodyPart: passed unchanged to every Product built by run()
        """
        self.section = section
        self.subsection = subsection
        self.type = productType
        self.bodies = bodyPart

    def run(self, usePlainImage = True, download = False):
        """Perform the scraping on the Zara website.

        Opens the home page, follows the configured section and subsection
        menu links, then visits every product of the resulting list.

        @param usePlainImage: forwarded to ZaraBrowser.getProductImageLink
                              to choose between the 'plain' and the 'full'
                              product image
        @param download: when True, also write each image below
                         self.dl_folder (the folder is created if missing)
        @return: list of Product objects; an empty list when one of the
                 two menu pages could not be fetched
        @raise OSError: if the download folder cannot be created for a
                        reason other than "already exists"
        """
        if download:
            # DL_FOLDER_PATH_BASE is not defined in this class; presumably
            # it is provided by Scraper -- confirm.
            self.dl_folder = self.DL_FOLDER_PATH_BASE + self.lang + '/' + self.section + '/' + self.subsection + '/'
            # Create folder if is not existing
            try:
                os.makedirs(self.dl_folder)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

        log.info('-- Starting scraping --')

        home = self.downloader.getFile(self.PAGE_BASE + self.lang + '/')
        browser = ZaraBrowser(home)

        # Go down the two menu levels. The second link is looked up in the
        # page reached through the first one, so the order matters.
        for menuName in (self.section, self.subsection):
            url = browser.getMenuLinkFromName(menuName)
            try:
                browser.goTo(url, 5)
            except Exception:
                # "except Exception" (not a bare except) so that Ctrl-C and
                # SystemExit still stop the scraper.
                log.warning("Unable to get the page '" + url + "'. Omitting.")
                return []

        # Start items parsing
        itemList = []
        for item in browser.getProductsList():
            log.debug('zzZZZZzzz')
            time.sleep(3) # let's do it cool

            # Goto the product page
            try:
                browser.goTo(item['url'])
            except Exception:
                log.warning("FAIL : Unable to download '" + item['name'] + "'. Omitting.")
                continue

            imgUrl = browser.getProductImageLink(usePlainImage)
            if imgUrl is None:
                log.info('FAIL : Unable to get product image for "' + item['name'] + '". Omitting.')
                continue

            color = browser.getProductColor()

            # Images are numbered by how many products were kept so far,
            # so skipped products do not leave gaps in the numbering.
            imgFilename = str(len(itemList)) + '-' + item['name'].replace(' ', '_')
            imgPath = '/'.join([self.BRAND_NAME,
                                self.lang,
                                self.section,
                                self.subsection,
                                imgFilename + '.jpg'])

            # build a product object
            product = Product(item['name'], self.BRAND_NAME, color, imgUrl, imgPath, self.type, self.bodies)
            itemList.append(product)

            log.info(product.toString())

            # Downloading file if flag is True
            if download:
                log.info('Downloading ' + imgFilename + '...')
                # NOTE(review): imgPath ends with '.jpg' but this target
                # does not -- confirm whether Downloader.writeFile adds
                # the extension itself.
                self.downloader.writeFile(imgUrl, self.dl_folder + imgFilename)

        log.info('-- Ending scraping --')
        log.info('-- ' + str(len(itemList)) + ' images was scraped --')

        return itemList
Exemplo n.º 4
0
class ZaraBrowser(Browser):
    """Hold the current Zara page as parsed HTML and extract data from it."""

    def __init__(self, page):
        """Parse the first page and create the Downloader used by goTo().

        @param page: Just a string with html code
        """
        self.downloader = Downloader()
        # NOTE(review): no parser is named, so BeautifulSoup picks one
        # itself; the result may differ between environments.
        self.soup = BeautifulSoup(page)

    def goTo(self, url, timeRetrying = None):
        """Download `url` and make it the current page.

        Any exception raised by the downloader propagates to the caller;
        in that case the current page is left unchanged.

        @param url: address of the page to load
        @param timeRetrying: forwarded unchanged to Downloader.getFile
        """
        page = self.downloader.getFile(url, timeRetrying)
        self.soup = BeautifulSoup(page)

    #
    # Menu section parsing
    #

    def getMenu(self, bSubmenu = False):
        """Return the navigation menu element of the current page.

        @param bSubmenu: when True, return the first <ul class="bSubmenu">
                         found inside the menu instead of the whole menu
        @return: the matching element, or None when it is not found
        @raise AttributeError: if bSubmenu is True and the page has no
                               element with id 'mainNavigationMenu'
        """
        menu = self.soup.find(id = 'mainNavigationMenu')
        if bSubmenu:
            menu = menu.find('ul', attrs = {'class': 'bSubmenu'})

        return menu

    def getMenuEntries(self, bSubmenu = False):
        """Return every <a> element of the menu (or of its submenu).

        @param bSubmenu: forwarded to getMenu()
        @raise AttributeError: if getMenu() finds nothing
        """
        return self.getMenu(bSubmenu).find_all('a')

    def getMenuLinkFromName(self, name):
        """Return the href of the menu link whose text matches `name`.

        The link text is matched case-insensitively and must start with
        whitespace followed by `name`. `name` is inserted into the regular
        expression as-is, so regex metacharacters in it are interpreted.

        @param name: menu entry name to look for
        @raise AttributeError: if the menu or the link is not found
        """
        menu = self.getMenu()
        link = menu.find('a', text = re.compile(r'\s+' + name, re.I)).get('href')

        return link

    #
    # Products section parsing
    #

    def getProductsList(self):
        """List the products shown on the current page.

        @return: list of {'name': lower-cased link text, 'url': link href},
                 one entry per <div class="product-info"> inside the
                 element with id 'product-list'
        @raise AttributeError: if the product list, or the link of one of
                               its entries, is missing
        """
        product_list = self.soup.find(id = 'product-list')
        product_list_info = product_list.find_all('div', attrs = {'class': 'product-info'})

        products = []
        for product in product_list_info:
            product_link = product.find('a')

            products.append({'name': product_link.get_text().lower(),
                             'url': product_link.get('href')})

        return products

    #
    # Product page parsing
    #

    def getProductImageLink(self, usePlainImage):
        """Return the 'plain' image link if usePlainImage, else the 'full' one.

        @return: image URL, or None when the image is not found
        """
        if usePlainImage:
            return self.getProductPlainImageLink()
        else:
            return self.getProductFullImageLink()

    def getProductPlainImageLink(self):
        """
        @warning: May do not have a return value
        @return: 'plain' image or None
        """
        return self._getProductImageLink('plain')

    def getProductFullImageLink(self):
        """
        @warning: May do not have a return value
        @return: 'full' image or None
        """
        return self._getProductImageLink('full')

    def _getProductImageLink(self, kind):
        """Shared lookup behind the 'plain' and 'full' image getters.

        Looks for <img class="image-big"> inside <div class="`kind`">
        inside <div class="bigImageContainer">.

        @param kind: CSS class of the wrapping div ('plain' or 'full')
        @return: the image URL, or None (after logging a warning) when
                 any of the elements, or the src attribute, is missing
        """
        container = self.soup.find('div', attrs = {'class': 'bigImageContainer'})

        try:
            imageSrc = container.find('div', attrs = {'class': kind}) \
                                .find('img', attrs = {'class': 'image-big'}) \
                                .get('src')
        except AttributeError:
            # One of the find() calls returned None.
            imageSrc = None

        if imageSrc is None:
            # Also covers an <img> without src, which would otherwise make
            # re.match() below raise TypeError.
            log.warning('No "' + kind + '" image found for this product.')
            return None

        # Already absolute (http or https): keep it. Testing only for
        # 'http://' would turn 'https://host/x' into 'http:https://host/x'.
        if re.match(r'^https?://', imageSrc, re.I):
            return imageSrc

        # Otherwise the src is expected to be protocol-relative ('//host/x').
        return 'http:' + imageSrc

    def getProductColor(self):
        """Return the selected colour of the product on the current page.

        Reads the <label class="selected"> inside <div class="colors">
        inside <form name="itemAdd">.

        @return: {'name': text of the label's <span>,
                  'value': the label's data-colorcode attribute}
        @raise AttributeError: if any of these elements is missing
        """
        container = self.soup.find('form', attrs = {'name': 'itemAdd'}) \
                             .find('div', attrs = {'class': 'colors'}) \
                             .find('label', attrs = {'class': 'selected'})
        color_name = container.find('span').get_text()
        color_value = container.get('data-colorcode')

        return {'name': color_name, 'value': color_value}