Пример #1
0
 def scrape(self, merchantId):
     """Scrape every product-search result page for a seller.

     Args:
         merchantId: Amazon merchant identifier; falsy values are rejected.

     Returns:
         A list of per-page parse results on success, a SCRAPED_NO_DATA
         status when the first page parsed empty, a FAILED status when
         the first fetch returned nothing, or False for a missing id.
     """
     if not merchantId:
         return False
     url = "https://www.amazon." + self.region + "/s?merchant=" + merchantId
     print(url)
     content = Model_Scraper_Standard(self.region).processSellerProduct(url)
     if content:
         result = self.processor.process(content)
         if result:
             data = [result]
             # BUG FIX: a leftover debug line (`pagecount = 1  # test`)
             # overrode the computed page count and disabled pagination
             # entirely; it has been removed. The 50-page cap is kept to
             # bound the scrape.
             pagecount = int(self.processor.getPageCount(content))
             if pagecount > 50:
                 pagecount = 50
             for i in range(2, pagecount + 1):
                 pageurl = "https://www.amazon." + self.region + "/s?merchant=" + merchantId + "&page=" + str(
                     i)
                 print(pageurl)
                 pageContent = Model_Scraper_Standard(
                     self.region).processSellerProduct(pageurl)
                 if not pageContent:
                     # Skip pages that failed to download.
                     continue
                 pageResult = self.processor.process(pageContent)
                 if pageResult:
                     data.append(pageResult)
             return data
         return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA
     return Model_Static_DownloadQueue_Status().FAILED
Пример #2
0
    def scrape(self, asin, scrapedCount):
        """Scrape the review pages of a product, newest first.

        Args:
            asin: product identifier used to build the review URLs.
            scrapedCount: number of reviews already scraped in previous
                runs; whole pages covered by it are skipped (10 per page).

        Returns:
            A dict with 'summary', 'list' and 'new_scraped_count' keys on
            success, a SUCCESS_NO_DATA status when nothing was parsed,
            None when the fetch returned None, or False on failure.
        """
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/gp/product-reviews/" + asin + "?sortBy=recent&pageNumber=1"
        print(url)
        content = self.process.processReview(url)
        if (content):
            data = {}
            items = []
            summary = self.processor.getSummary(content.encode('utf-8'))
            if (summary):
                data['summary'] = summary
            # Parse the first page of reviews.
            result = self.processor.process(content.encode('utf-8'))
            if (result):
                items.append(result)

            newScrapedCount = 10
            # BUG FIX: data['summary'] was read unconditionally and raised
            # KeyError whenever getSummary() returned nothing — the key is
            # only set when summary is truthy. Guard on `summary` instead.
            if (summary and summary['page_count']
                    and summary['page_count'] > 0):
                pageCount = summary['page_count']
                # Pages already covered by a previous scrape (10 reviews/page).
                scrapedPageCount = int(floor(int(scrapedCount) / 10))
                # Pages that still need scraping.
                pageCount = pageCount - scrapedPageCount
                # NOTE(review): capping to 2 looks like leftover debug code
                # (sibling scrapers cap at 20/50) — confirm the intended cap.
                if (pageCount > 20):
                    pageCount = 2
                if (pageCount >= 2):
                    newScrapedCount = pageCount * 10
                    for i in range(2, pageCount + 1):
                        pageUrl = "https://www.amazon." + self.region + "/gp/product-reviews/" + asin + "?sortBy=recent&pageNumber=" + str(
                            i)
                        print(pageUrl)
                        pageContent = self.process.processReview(pageUrl)
                        if not pageContent:
                            # Skip pages that failed to download.
                            continue
                        pageResult = self.processor.process(
                            pageContent.encode("utf-8"))
                        if (pageResult):
                            items.append(pageResult)
            data['list'] = items
            if (len(data) > 0):
                data['new_scraped_count'] = newScrapedCount
                return data
            else:
                return Model_Static_Scrape_Status.SUCCESS_NO_DATA
        elif (content == None):
            return None
        else:
            return False
Пример #3
0
 def scrape(self, region, keywords):
     """Scrape the mobile (JSON) keyword-search results for a region.

     Args:
         region: Amazon TLD region suffix (e.g. "com", "de").
         keywords: search terms, already URL-safe.

     Returns:
         A list of per-page parse results followed by a trailing
         {'total': ...} entry on success; an empty list when the first
         page parsed empty; None (implicit) when the first fetch failed.
     """
     result = []
     self.process = Model_Scraper_Standard(region)
     requrl = "https://www.amazon."+region+"/s?page="+str(1)+"&keywords="+keywords+"&dataVersion=v0.2&cid=08e6b9c8bdfc91895ce634a035f3d00febd36433&format=json"
     content = self.process.mobile_process(requrl)
     if(content):
         data = self.processor.mobile_process(region, content)
         if(data):
             result.append(data)
             page_count = content['pagination']['numPages']
             # Hard cap at 20 pages to bound the scrape.
             if (int(page_count) > 20):
                 page_count = 20
             for k in range(2, page_count + 1):
                 try:
                     requrl = "https://www.amazon." + region + "/s?page=" + str(k) + "&keywords=" + keywords + "&dataVersion=v0.2&cid=08e6b9c8bdfc91895ce634a035f3d00febd36433&format=json"
                     content = self.process.mobile_process(requrl)
                     # BUG FIX: the parse result was appended
                     # unconditionally, so failed/empty pages inserted
                     # None into the result list; skip them instead.
                     page_data = self.processor.mobile_process(region, content)
                     if page_data:
                         result.append(page_data)
                 except Exception as err:
                     print (err)
             total = {}
             try:
                 # `content` here is the last page fetched in the loop.
                 total['total'] = content['resultsMetadata']['totalResults']
             except Exception as err:
                 print (err)
             result.append(total)
         return result
Пример #4
0
 def scrape(self, merchantId):
     """Fetch and parse a seller's profile page.

     Returns the parsed seller data on success, a SCRAPED_NO_DATA status
     when the page fetched but parsed empty, a FAILED status when the
     fetch returned nothing, or False for a missing merchant id.
     """
     if not merchantId:
         return False
     sellerUrl = "http://www.amazon." + self.region + "/gp/aag/main?seller=" + merchantId
     pageHtml = Model_Scraper_Standard(self.region).processSeller(sellerUrl)
     if not pageHtml:
         return Model_Static_DownloadQueue_Status().FAILED
     parsed = self.processor.process(pageHtml)
     if not parsed:
         return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA
     return parsed
Пример #5
0
 def scraper(self, keywords):
     """Fetch and parse the first keyword-search result page.

     Args:
         keywords: search terms, already URL-safe.

     Returns:
         The parsed result on success, None when the fetch returned None
         or the parse produced nothing, False on any failure.
     """
     self.process = Model_Scraper_Standard(self.region)
     url = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=1"
     print(url)
     try:
         content = self.process.processkeywords(url)
     except Exception as err:
         # BUG FIX: on a fetch exception `content` stayed undefined; the
         # NameError raised below was swallowed by a bare `except:` and
         # turned into False. Fail fast here with the same return value.
         print(err)
         return False
     if content:
         try:
             result = self.processor.process(content)
         except Exception:
             # Parsing failures are reported as a hard failure,
             # matching the original bare-except behavior.
             return False
         if result:
             return result
         return None
     elif content is None:
         return None
     else:
         return False
Пример #6
0
 def scrape(self, begin, end):
     """Scrape top-reviewer ranking pages `begin` (inclusive) to `end`
     (exclusive), 10 reviewers per page.

     Args:
         begin, end: page numbers; accepted as ints or digit strings.

     Returns:
         A list of per-page parse results, a FAILED status for invalid
         input, or None (implicit) when nothing was scraped.
     """
     # BUG FIX: the guard used `and`, so it only rejected input when ALL
     # three conditions held (e.g. begin=5, end=2 passed). Reject when
     # ANY check fails, and compare numerically after validation.
     if not str(begin).isdigit() or not str(end).isdigit():
         return Model_Static_Scrape_Status.FAILED
     begin, end = int(begin), int(end)
     if begin > end:
         return Model_Static_Scrape_Status.FAILED
     self.process = Model_Scraper_Standard(self.region)
     data = []
     # NOTE(review): range(begin, end) excludes page `end` — confirm
     # whether the final page is intentionally skipped.
     for i in range(begin, end):
         pageUrl = "https://www.amazon." + self.region + "/review/top-reviewers?page=" + str(
             i)
         pageContent = self.process.processTopReviewer(pageUrl)
         if not pageContent:
             # Skip pages that failed to download.
             continue
         # Page i covers ranks (i*10 - 9) .. (i*10).
         rankEnd = i * 10
         rankBegin = rankEnd - 9
         pageResult = self.processor.process(pageContent, rankBegin,
                                             rankEnd + 1)
         if (pageResult):
             data.append(pageResult)
     if (len(data)):
         return data
Пример #7
0
 def scrape(self, asin):
     """Fetch and parse a product detail page for *asin*.

     Returns:
         The parsed product data on success, None when the fetch
         returned None, False on any failure (fetch error, empty parse).
     """
     self.process = Model_Scraper_Standard(self.region)
     url = "https://www.amazon." + self.region + "/dp/" + asin + "?th=1&psc=1"
     print(url)
     try:
         content = self.process.process(url)
     except Exception as err:
         # BUG FIX: on a fetch exception `content` stayed undefined; the
         # NameError raised below was swallowed by a bare `except:` and
         # turned into False. Fail fast here with the same return value.
         print(err)
         return False
     if content:
         try:
             data = self.processor.process(content.encode('utf-8'))
         except Exception:
             # Parsing failures are reported as a hard failure,
             # matching the original bare-except behavior.
             return False
         if data:
             return data
         return False
     elif content is None:
         return None
     else:
         return False
Пример #8
0
 def scraper(self, keywords):
     """Scrape up to five keyword-search result pages.

     Args:
         keywords: search terms, already URL-safe.

     Returns:
         A list of per-page parse results on success, None when the
         fetch returned None or the first page parsed empty, False on
         any failure.
     """
     self.process = Model_Scraper_Standard(self.region)
     url = "https://www.amazon."+self.region+"/gp/search?keywords="+keywords+"&page=1"
     print (url)
     try:
         content = self.process.processkeywords(url)
     except Exception as err:
         # BUG FIX: on a fetch exception `content` stayed undefined; the
         # NameError raised below was swallowed by a bare `except:` and
         # turned into False. Fail fast here with the same return value.
         print (err)
         return False
     try:
         if content:
             data = []
             result = self.processor.process(content.encode('utf-8'), 1)
             if result:
                 data.append(result)
                 # Hard cap at five pages to bound the scrape.
                 pagecount = int(self.processor.getPageCount(content))
                 if pagecount > 5:
                     pagecount = 5
                 for i in range(2, pagecount + 1):
                     pageurl = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=" + str(i)
                     print (pageurl)
                     pagecontent = self.process.processkeywords(pageurl)
                     if pagecontent:
                         pageresult = self.processor.process(pagecontent.encode('utf-8'), i)
                         # BUG FIX: empty parses were appended as None;
                         # keep only real results in the data list.
                         if pageresult:
                             data.append(pageresult)
                 return data
         elif content is None:
             return None
         else:
             return False
     except Exception:
         # Parsing failures are reported as a hard failure,
         # matching the original bare-except behavior.
         return False
Пример #9
0
 def process(self, asin):
     """Fetch the offer-listing data for *asin*.

     Stores the scraper on self.processOffer and returns the fetched
     content when non-empty; otherwise returns None (implicitly).
     """
     self.processOffer = Model_Scraper_Standard(self.region)
     offerData = self.processOffer.processOffer(self.region, asin)
     if offerData:
         return offerData