Пример #1
0
    def parseProductTitle(self, item, ad):
        """Store the product title found on ``item`` under ``ad['title']``.

        The title is read from the ``data-attribute`` attribute of the first
        ``h2`` element returned by ``item.find``.

        Args:
            item: Node exposing ``find`` (presumably a BeautifulSoup tag for
                one search result -- confirm against the caller).
            ad: Mapping that receives the ``'title'`` entry.

        Raises:
            utility.CrawlerError: If no ``h2`` is found, or the ``h2`` has no
                ``data-attribute`` attribute.
        """
        heading = item.find('h2')

        # Success path first: a heading exists and carries the attribute.
        if heading and heading.get('data-attribute') is not None:
            ad['title'] = heading['data-attribute']
            return

        # Both failure modes report the same message.
        raise utility.CrawlerError('Product title is missing')
Пример #2
0
    def parseProductLink(self, item, ad):
        """Record the normalized product URL in ``ad['detail_url']``.

        Looks up the first ``a`` element matching the class string
        ``'a-link-normal a-text-normal'``, normalizes its ``href`` through
        ``utility.normalizeUrl`` and registers it in ``self.visitedUrl``.

        Args:
            item: Node exposing ``find`` (presumably a BeautifulSoup tag for
                one search result -- confirm against the caller).
            ad: Mapping that receives the ``'detail_url'`` entry.

        Raises:
            utility.CrawlerError: If the anchor is not found, or the
                normalized URL is already present in ``self.visitedUrl``.
        """
        anchor = item.find('a', class_='a-link-normal a-text-normal')
        if not anchor:
            raise utility.CrawlerError('Product format not correct', 0)

        url = utility.normalizeUrl(anchor.get('href'))

        # Written before the duplicate check, so ``ad`` keeps the URL even
        # when the duplicate error below is raised.
        ad['detail_url'] = url

        if url not in self.visitedUrl:
            self.visitedUrl.add(url)
            return

        raise utility.CrawlerError('Product link already exists')
Пример #3
0
    def parseProductBrand(self, item, ad):
        """Store the product brand found on ``item`` under ``ad['brand']``.

        The brand is taken from the ``string`` of the *second* ``span``
        matching the class string ``'a-size-small a-color-secondary'``.

        Args:
            item: Node exposing ``find_all`` (presumably a BeautifulSoup tag
                for one search result -- confirm against the caller).
            ad: Mapping that receives the ``'brand'`` entry.

        Raises:
            utility.CrawlerError: If fewer than two matching spans are found.
        """
        spans = item.find_all('span', class_='a-size-small a-color-secondary')

        # Index 1 is read below, so at least two spans are required. The
        # previous ``< 1`` guard let a single-span result escape as a bare
        # IndexError instead of a CrawlerError.
        if len(spans) < 2:
            raise utility.CrawlerError('Product brand is missing')

        ad['brand'] = spans[1].string
Пример #4
0
 def parseProductKeywords(self, item, ad):
     """Derive ``ad['keywords']`` from the already-parsed ``ad['title']``.

     The title is passed through ``utility.cleanedTokenize`` and the result
     is stored on ``ad`` before it is validated.

     Args:
         item: Unused here; kept so the signature matches the other parsers.
         ad: Mapping that must already contain ``'title'`` and receives the
             ``'keywords'`` entry.

     Raises:
         utility.CrawlerError: If tokenizing the title yields no keywords.
     """
     tokens = utility.cleanedTokenize(ad['title'])
     ad['keywords'] = tokens

     if len(tokens) == 0:
         raise utility.CrawlerError('Lack of keywords')