Example #1
def scrape(self, merchantId):
    if not merchantId:
        return False
    url = "https://www.amazon." + self.region + "/s?merchant=" + merchantId
    print(url)
    content = Model_Scraper_Standard(self.region).processSellerProduct(url)
    if content:
        result = self.processor.process(content)
        if result:
            data = []
            data.append(result)
            pagecount = int(self.processor.getPageCount(content))
            pagecount = 1  # testing override: forces a single page and skips the loop below
            if pagecount > 1:
                if pagecount > 50:
                    pagecount = 50  # testing value; originally 50
                for i in range(2, pagecount + 1):
                    pageurl = "https://www.amazon." + self.region + "/s?merchant=" + merchantId + "&page=" + str(i)
                    print(pageurl)
                    pageContent = Model_Scraper_Standard(
                        self.region).processSellerProduct(pageurl)
                    if not pageContent:
                        continue
                    pageResult = self.processor.process(pageContent)
                    if pageResult:
                        data.append(pageResult)
            return data
        return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA
    return Model_Static_DownloadQueue_Status().FAILED
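The URL scheme above is simple enough to factor out. A minimal, runnable sketch of just the URL construction; the region and merchant ID values below are placeholders, not values from the source:

def seller_products_url(region, merchant_id, page=1):
    # the first page omits the page parameter, matching the snippet above
    url = "https://www.amazon." + region + "/s?merchant=" + merchant_id
    if page > 1:
        url += "&page=" + str(page)
    return url

print(seller_products_url("com", "A2EXAMPLE"))           # page 1
print(seller_products_url("com", "A2EXAMPLE", page=3))   # page 3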
Example #2
def scrape(self, region, keywords):
    result = []
    self.process = Model_Scraper_Standard(region)
    requrl = "https://www.amazon." + region + "/s?page=" + str(1) + "&keywords=" + keywords + "&dataVersion=v0.2&cid=08e6b9c8bdfc91895ce634a035f3d00febd36433&format=json"
    content = self.process.mobile_process(requrl)
    if content:
        # parse the response
        data = self.processor.mobile_process(region, content)
        if data:
            result.append(data)
            page_count = int(content['pagination']['numPages'])
            if page_count > 20:
                page_count = 20
            for k in range(2, page_count + 1):
                try:
                    requrl = "https://www.amazon." + region + "/s?page=" + str(k) + "&keywords=" + keywords + "&dataVersion=v0.2&cid=08e6b9c8bdfc91895ce634a035f3d00febd36433&format=json"
                    content = self.process.mobile_process(requrl)
                    result.append(self.processor.mobile_process(region, content))
                except Exception as err:
                    print(err)
            try:
                # grand total reported by the last page fetched
                total = {'total': content['resultsMetadata']['totalResults']}
                result.append(total)
            except Exception as err:
                print(err)
        return result
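The mobile endpoint returns JSON, and the snippet relies on two of its fields. A hedged sketch of that structure, inferred purely from the accesses above (content['pagination']['numPages'] and content['resultsMetadata']['totalResults']); the sample values are made up:

content = {
    "pagination": {"numPages": "23"},
    "resultsMetadata": {"totalResults": "412"},
}
page_count = int(content["pagination"]["numPages"])
if page_count > 20:  # the scraper caps itself at 20 pages
    page_count = 20
total = {"total": content["resultsMetadata"]["totalResults"]}
print(page_count, total)  # 20 {'total': '412'}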
Example #3
def scrape(self, asin, scrapedCount):
    self.process = Model_Scraper_Standard(self.region)
    url = "https://www.amazon." + self.region + "/gp/product-reviews/" + asin + "?sortBy=recent&pageNumber=1"
    print(url)
    content = self.process.processReview(url)
    if content:
        data = {}
        items = []
        summary = self.processor.getSummary(content.encode('utf-8'))
        if summary:
            data['summary'] = summary
        # process the first page
        result = self.processor.process(content.encode('utf-8'))
        if result:
            items.append(result)

        newScrapedCount = 10
        pageCount = data.get('summary', {}).get('page_count')
        if pageCount and pageCount > 0:
            # pages already covered by previously scraped reviews (10 per page)
            scrapedPageCount = int(scrapedCount) // 10
            # pages that still need to be fetched
            pageCount = pageCount - scrapedPageCount
            if pageCount > 20:
                pageCount = 2  # looks like a testing override; a cap of 20 would match the check above
            if pageCount >= 2:
                newScrapedCount = pageCount * 10
                for i in range(2, pageCount + 1):
                    pageUrl = "https://www.amazon." + self.region + "/gp/product-reviews/" + asin + "?sortBy=recent&pageNumber=" + str(i)
                    print(pageUrl)
                    pageContent = self.process.processReview(pageUrl)
                    if not pageContent:
                        continue
                    # process this page's data
                    pageResult = self.processor.process(
                        pageContent.encode("utf-8"))
                    if pageResult:
                        items.append(pageResult)
        data['list'] = items
        if len(data) > 0:
            data['new_scraped_count'] = newScrapedCount
            return data
        else:
            return Model_Static_Scrape_Status.SUCCESS_NO_DATA
    elif content is None:
        return None
    else:
        return False
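The incremental logic above boils down to: each page holds 10 reviews, so scrapedCount already covers scrapedCount // 10 pages, and only the remainder needs fetching. A runnable sketch of that arithmetic (the cap of 20 is an assumption matching the check in the snippet; the snippet itself caps to 2, which looks like a test leftover):

def pages_to_fetch(total_pages, scraped_count, cap=20):
    scraped_pages = int(scraped_count) // 10  # pages already covered
    remaining = total_pages - scraped_pages
    return min(remaining, cap)

# 37 pages total, 120 reviews (12 pages) already scraped -> 25, capped to 20
print(pages_to_fetch(37, 120))  # 20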
Example #4
class Model_Scraper_Keywords_First(Model_Scraper_Standard):
    def __init__(self, region):
        super(Model_Scraper_Keywords_First, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'Keywords_First', region)

    def scraper(self, keywords):
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=1"
        print(url)
        content = None  # stays None if the request below raises
        try:
            content = self.process.processkeywords(url)
        except Exception as err:
            print(err)
        try:
            if content:
                # parsing code goes here
                result = self.processor.process(content)
                if result:
                    return result
                return False  # page fetched but nothing parsed
            elif content is None:
                return None
            else:
                return False
        except Exception:
            return False
Example #5
class Model_Scraper_Product_Base(Model_Scraper_Standard):
    def __init__(self, region):
        super(Model_Scraper_Product_Base, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'Product_Base', region)

    def scrape(self, asin):
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/dp/" + asin + "?th=1&psc=1"
        print(url)
        content = None  # stays None if the request below raises
        try:
            content = self.process.process(url)
        except Exception as err:
            print(err)
        try:
            if content:
                # parsing code
                data = self.processor.process(content.encode('utf-8'))
                if data:
                    return data
                else:
                    return False
            elif content is None:
                return None
            else:
                return False
        except Exception:
            return False
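A minimal usage sketch under this snippet's return convention (a dict means parsed data, None means the request itself failed, False means the page was fetched but parsing yielded nothing). It assumes Model_Scraper_Product_Base from the example above is importable; the region and ASIN are placeholders, not values from the source:

scraper = Model_Scraper_Product_Base("com")  # placeholder region
data = scraper.scrape("B000000000")          # placeholder ASIN
if data:
    print(data)
elif data is None:
    print("request failed before any content came back")
else:
    print("page fetched, but parsing produced no data")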
Example #6
class Model_Scraper_TopReviewer(Model_Scraper_Standard):
    def __init__(self, region):
        super(Model_Scraper_TopReviewer, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'TopReviewer', region)

    def scrape(self, begin, end):
        # reject non-numeric bounds or an inverted range
        if not str(begin).isdigit() or not str(end).isdigit() or begin > end:
            return Model_Static_Scrape_Status.FAILED
        self.process = Model_Scraper_Standard(self.region)
        data = []
        for i in range(begin, end):
            pageUrl = "https://www.amazon." + self.region + "/review/top-reviewers?page=" + str(i)
            pageContent = self.process.processTopReviewer(pageUrl)
            if not pageContent:
                continue
            rankEnd = i * 10
            rankBegin = rankEnd - 9
            pageResult = self.processor.process(pageContent, rankBegin,
                                                rankEnd + 1)
            if pageResult:
                # collect this page's reviewers
                data.append(pageResult)
        if len(data):
            return data
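Each top-reviewers page lists 10 reviewers, so page i covers ranks i*10-9 through i*10; that is all rankBegin/rankEnd compute above. A runnable sketch of the mapping:

def rank_window(page):
    rank_end = page * 10
    rank_begin = rank_end - 9
    return rank_begin, rank_end

print(rank_window(1))  # (1, 10)
print(rank_window(3))  # (21, 30)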
Example #7
class Model_Scraper_Keywords(Model_Scraper_Standard):
    def __init__(self, region):
        super(Model_Scraper_Keywords, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor('Keywords', region)

    def scraper(self, keywords):
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=1"
        # run without showing a browser window:
        # with Display(backend="xvfb", size=(1440, 900)):
        print(url)
        content = None  # stays None if the request below raises
        try:
            content = self.process.processkeywords(url)
        except Exception as err:
            print(err)
        try:
            if content:
                # parsing code goes here
                data = []
                result = self.processor.process(content.encode('utf-8'), 1)
                if result:
                    data.append(result)
                    pagecount = int(self.processor.getPageCount(content))
                    if pagecount > 5:
                        pagecount = 5
                    if pagecount > 1:
                        for i in range(2, pagecount + 1):
                            pageurl = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=" + str(i)
                            print(pageurl)
                            pagecontent = self.process.processkeywords(pageurl)
                            if pagecontent:
                                pageresult = self.processor.process(pagecontent.encode('utf-8'), i)
                                data.append(pageresult)
                    return data
                return False  # first page fetched but nothing parsed
            elif content is None:
                return None
            else:
                return False
        except Exception:
            return False
Example #8
def scrape(self, merchantId):
    if not merchantId:
        return False
    url = "http://www.amazon." + self.region + "/gp/aag/main?seller=" + merchantId
    content = Model_Scraper_Standard(self.region).processSeller(url)
    if content:
        data = self.processor.process(content)
        if data:
            return data
        return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA
    return Model_Static_DownloadQueue_Status().FAILED
Example #9
class Model_Scraper_Product_Offer1(Model_Scraper_Standard):
    def __init__(self, region):
        super(Model_Scraper_Product_Offer1, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor(
            'Product_Offer', region)

    def process(self, asin):
        self.processOffer = Model_Scraper_Standard(self.region)
        content = self.processOffer.processOffer(self.region, asin)
        if content:
            return content

    # def scrapeInventory(self, data):
    #     if (data == '' or data == None):
    #         return Model_Static_Scrape_Status.FAILED
    #     url = "http://www.amazon." + self.region + "/gp/aws/cart/add.html"
    #     fields = []
    #     session_id = None

    def scrape(self, asin):
        content = self.process(asin)
        if not content:
            return
        # parsing code goes here; the parsed data then drives the inventory scrape
        print(content)
        data = self.processor.process(content.encode('utf-8'))
        if data:
            print(data)
            # compute inventory from the parsed data
            # Inventory = self.scrapeInventory(asin, data)
        pageCount = int(self.processor.getPageCount(content))
        if pageCount > 1:
            for i in range(2, pageCount + 1):
                index = str((i - 1) * 10)
                # note: the region is hardcoded to "com" here instead of self.region
                pageUrl = "http://www.amazon." + "com" + "/gp/offer-listing/" + asin + "/ref=olpOffersSuppressed?ie=UTF8&f_new=true&overridePriceSuppression=1&startIndex=" + index
                pageContent = self.processPageOffer(pageUrl)
                if pageContent:
                    print(pageContent)
                    pageResult = self.processor.process(
                        pageContent.encode('utf-8'))
                    if pageResult:
                        print(pageResult)
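Offer listings paginate with a startIndex parameter in steps of 10 rather than a page number, which is what index = (i - 1) * 10 encodes above. A runnable sketch of the mapping:

def offer_start_index(page):
    # page 2 -> startIndex=10, page 3 -> 20, and so on
    return (page - 1) * 10

for page in range(2, 5):
    print(page, offer_start_index(page))  # 2 10 / 3 20 / 4 30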