Пример #1
0
 def sort_second(self, response):
     """Crawl one node of the category tree.

     A page that shows the goods list is treated as a leaf: it is handed
     to ``self.get_list`` and one item recording the category path and
     page URL is emitted.  A page that only shows the left-hand category
     menu is expanded: every sub-category not already on the path yields
     an item plus a request back to this callback, with the extended path
     in ``meta["sort"]``.  Any other page goes through ``self.try_again``.
     """
     # Is the left-hand navigation list present?
     has_menu = re.search("(category-menu)", response.text)
     # Is the goods list present?
     has_listing = re.search("(js-Search-titleCount)", response.text)
     sort = response.meta.get("sort")

     if has_listing:
         # get_list may hand back a generator or a list; iterate it so its
         # output reaches the engine.  A None return is ignored.
         listing = self.get_list(response, has_listing)
         if listing is not None:
             yield from listing
         item = GmWorkItem()
         item["sort"] = str(sort)
         item["url"] = response.url
         yield item
         return

     if not has_menu:
         yield self.try_again(response, key=str(sort), url=response.url)
         return

     headers = self.get_headers(1)
     # Tolerate a request that arrived without a "sort" path in its meta.
     path = list(sort or [])
     for link in response.css(".category-menu").xpath("./dl/dd//a"):
         url = link.xpath("./@href").get()
         name = link.xpath("./text()").get()
         # Anchors without an href or without visible text cannot be followed.
         if not url or not name or not re.search(r"\w", name):
             continue
         # Skip names already on the path so the crawl does not loop.
         if name in path:
             continue
         sort_copy = path + [name]
         item = GmWorkItem()
         item["sort"] = str(sort_copy)
         item["url"] = url
         yield item
         yield scrapy.Request(url=url,
                              method="GET",
                              callback=self.sort_second,
                              headers=headers,
                              dont_filter=True,
                              meta={"sort": sort_copy})
Пример #2
0
    def parse(self, response):
        """Parse a fruugo.co.uk listing page.

        Emits one item holding the raw page source, one item per product
        link, and - when ``meta["first_page"]`` is set and the total count
        could be parsed - requests for pages 2..N (capped at 1000 pages of
        64 products).  Pages without the expected markers go through
        ``self.try_again``.
        """
        youxiao = re.search("(information-holder|results)", response.text)
        url_key = response.request.url
        if not youxiao:
            yield self.try_again(response, url=url_key)
            return

        request_id = response.meta.get("id")
        category = response.meta.get("category")
        first_page = response.meta.get("first_page")
        page_num = response.meta.get("page_num", 1)

        item_s = GmWorkItem()
        item_s["url"] = url_key
        item_s["source_code"] = response.text
        yield item_s

        # Total number of goods, e.g. "... of 1,234 results".
        goods_num = response.css(".results").xpath("./text()").get()
        total_goods = None
        if goods_num:
            match = re.search("of ([^ ]+) results", goods_num)
            if match:
                goods_num = match.group(1).replace(",", "")
                try:
                    total_goods = int(goods_num)
                except ValueError:
                    # Keep the text for the items, but do not paginate on it.
                    total_goods = None

        shop_list = response.css(".row.information-holder").xpath("./a")
        if not shop_list:
            print("shop_list有url没有选取", request_id)
        for link in shop_list:
            href = link.xpath("./@href").get()
            if not href:
                # An anchor without href cannot be turned into a product URL.
                continue
            item = GmWorkItem()
            item["key"] = url_key
            item["name"] = link.xpath("./span/text()").get()
            item["url"] = "https://www.fruugo.co.uk" + href
            item["price"] = link.xpath("./strong/text()").get()
            item["goods_num"] = goods_num
            item["category"] = category
            yield item

        # Pagination is only scheduled from the first page, and only when the
        # total is a real number (unparsed text used to reach int()).
        if first_page and total_goods:
            headers = self.get_headers(1)
            limitnum = 1000
            per_page = 64
            num = min(self.get_pagenum(total_goods, per_page), limitnum)
            for page in range(2, num + 1):
                # NOTE(review): every follow-up request carries this
                # response's page_num rather than the page index - confirm.
                meta = {
                    "id": request_id,
                    "category": category,
                    "page_num": page_num
                }
                yield scrapy.Request(url=url_key + "?page={}".format(page),
                                     method="GET",
                                     headers=headers,
                                     dont_filter=True,
                                     meta=meta)
Пример #3
0
 def parse(self, response):
     """Extract sellers from an amazon.co.uk offer-listing page.

     Emits the raw page source, then one item per seller link (or a single
     blank placeholder item when no seller link is found), and finally a
     request for the next page when an ``li.a-last`` link exists.  Pages
     without the expected markers go through ``self.try_again``.
     """
     youxiao = re.search("(olpOfferList|olpProduct)", response.text)
     key = response.meta.get("key")
     if not youxiao:
         yield self.try_again(response, key)
         return

     item_s = GmWorkItem()
     item_s["key"] = key
     item_s["source_code"] = response.text
     yield item_s

     shop_list = response.css(
         ".a-section.a-spacing-double-large").xpath(
             "./div//h3[@class='a-spacing-none olpSellerName']/a")
     if not shop_list:
         # Record the key even when the page lists no named seller.
         item = GmWorkItem()
         item["key"] = key
         item["name"] = ""
         item["url"] = ""
         item["seller_id"] = ""
         yield item

     for link in shop_list:
         name = link.xpath("./text()").get()
         if name:
             name = name.strip()
         url = link.xpath("./@href").get()
         seller_id = ""
         # An anchor without href gives None; re.search would raise on it and
         # abort the callback before the next-page request is scheduled.
         match = re.search('(s|seller)=(.*?)($|[&])', url) if url else None
         if match:
             seller_id = match.group(2)
         item = GmWorkItem()
         item["key"] = key
         item["name"] = name
         item["url"] = url
         item["seller_id"] = seller_id
         yield item

     next_url = response.css("li.a-last").xpath("./a/@href").get()
     if next_url:
         yield scrapy.Request(url="https://www.amazon.co.uk" + next_url,
                              method="GET",
                              headers=self.headers,
                              meta={"key": key})
Пример #4
0
 def sort_all(self, response):
     """Flatten a three-level category tree delivered as JSON.

     Emits the raw source, then one item per third-level category.  A
     second-level category without ``sub_sub`` children yields a single
     item whose third-level fields are empty strings.  A non-200 status
     or a body without a ``"data"`` key goes through ``self.try_again``.
     """
     youxiao = re.search('("data")', response.text)
     key = "sort"
     if not (response.status == 200 and youxiao):
         yield self.try_again(response, key=key)
         return

     item_s = GmWorkItem()
     item_s["key"] = key
     item_s["source_code"] = response.text
     yield item_s

     json_data = json.loads(response.text)
     # "or" fallbacks keep a null node from aborting the whole tree.
     for top in json_data.get("data") or []:
         main = top.get("main") or {}
         catid = main.get("catid")
         name = main.get("name")
         for child in top.get("sub") or []:
             catid_sub = child.get("catid")
             name_sub = child.get("name")
             # A childless second-level category still produces one row;
             # None stands in for the missing third level.
             for leaf in child.get("sub_sub") or [None]:
                 item = GmWorkItem()
                 item["catid"] = catid
                 item["category"] = name
                 item["catid_sub"] = catid_sub
                 item["category1"] = name_sub
                 if leaf is None:
                     item["catid_sub2"] = ""
                     item["category2"] = ""
                 else:
                     item["catid_sub2"] = leaf.get("catid")
                     item["category2"] = leaf.get("display_name")
                 yield item
Пример #5
0
 def sort_all(self, response):
     """Extract seller details (id, feedback, company data) from a seller page.

     Emits one item per valid page.  The embedded company-data JSON is
     parsed on a best-effort basis: a parse failure leaves the REGON/NIP
     fields as found so far and does not fail the page.  Pages without
     the "About seller" / "Sprzedający" marker go through
     ``self.try_again``.
     """
     youxiao = re.search("(About seller|Sprzedający)", response.text)
     url = response.request.url
     if not youxiao:
         yield self.try_again(response, url=url)
         return

     seller_id = ""
     match = re.search('"sellerId":"(.*?)"', response.text)
     if match:
         seller_id = match.group(1)

     positive_feedback = response.css(
         ".a7caa336.d7c56f78._476b319e").xpath("./text()").get()
     positive_number = ""
     bad_number = ""
     number = response.css(".fa4668cc").xpath("./text()").getall()
     # The counters are only trusted when exactly two values are present.
     if len(number) == 2:
         positive_number, bad_number = number
     year = response.css("._1604f5d6._82f13583").xpath(
         "./div/div/text()").get()

     regon = ""
     nip = ""
     company_data = []
     match = re.search('({"leftLink".*?"hideContact":.*?})',
                       response.text)
     if match:
         try:
             data = json.loads(match.group(1))
             # A null companyData stays an empty list instead of "None".
             company_data = data.get("companyData") or []
             for entry in company_data:
                 if "REGON" in entry:
                     regon = entry
                 if "NIP" in entry:
                     nip = entry
         except (ValueError, TypeError, AttributeError):
             # Best effort: malformed or unexpectedly shaped JSON must not
             # cost us the rest of the seller record.
             pass

     item = GmWorkItem()
     item["seller_id"] = seller_id
     item["positive_feedback"] = positive_feedback
     item["positive_number"] = positive_number
     item["bad_number"] = bad_number
     item["year"] = year
     item["regon"] = regon
     item["nip"] = nip
     item["company_data"] = str(company_data)
     yield item
Пример #6
0
 def get_list(self, response, youxiao=None):
     """Build one item per product card on a listing page.

     Args:
         response: Page response to parse.
         youxiao: Optional pre-computed match for the listing marker
             ``js-Search-titleCount``; searched for in the page text when
             not supplied.

     Returns:
         A list of ``GmWorkItem`` objects, one per product card, each
         tagged with the breadcrumb path; an empty list when the listing
         marker is absent.
     """
     if not youxiao:
         youxiao = re.search("(js-Search-titleCount)", response.text)
     if not youxiao:
         return []

     # Breadcrumb path: linked crumbs carry the text in a/span, the current
     # (unlinked) crumb in the second span.
     category_list = []
     for crumb in response.css(".Breadcrumb-list").xpath("./li"):
         crumb_text = crumb.xpath("./a/span/text()").get()
         if not crumb_text:
             crumb_text = crumb.xpath("./span[2]/text()").get()
         if crumb_text:
             category_list.append(crumb_text)

     items = []
     for good in response.css(".clearfix.Article-item.js-Search-hashLinkId"):
         description = good.css(".Article-desc")
         rating = good.css(".Article-rate.js-bestReview")
         score_number = rating.xpath("./span/span[2]/text()").get()
         if score_number:
             # The review count is rendered in parentheses.
             match = re.search(r"\((.*?)\)", score_number)
             if match:
                 score_number = match.group(1)
         price = good.css(".userPrice").xpath("./text()").get()
         if price:
             price = price.strip()

         item = GmWorkItem()
         item["good_url"] = description.xpath("./span/a/@href").get()
         item["good_name"] = description.xpath("./span/a/text()").get()
         item["good_id"] = good.xpath("./@id").get()
         item["score"] = rating.xpath("./span/span[1]/text()").get()
         item["score_number"] = score_number
         item["price"] = price
         item["goodshop_url"] = good.css(".OffersSumary.clearfix").xpath(
             "./a/@href").get()
         # The whole breadcrumb path, not just the last crumb visited.
         item["category"] = str(category_list)
         items.append(item)
     return items
Пример #7
0
 def get_detail(self, response):
     """Parse an anchor-detail JSONP response into one summary item.

     The payload is expected inside a ``mtopjsonp3(...)`` wrapper.  The
     item carries the anchor's counters plus a JSON dump of the replay
     list and the summed viewer count.  A response without the success
     marker, without the wrapper, or with an unparsable payload goes
     through ``self.try_again``.
     """
     seller_id = response.meta.get("seller_id")
     youxiao = re.search("(SUCCESS::调用成功)", response.text)
     match = None
     if youxiao:
         match = re.search(r" mtopjsonp3\((.*)\)", response.text)
     if not match:
         # Also covers a success marker without the expected wrapper, so
         # the response is not silently dropped.
         yield self.try_again(response, seller_id=seller_id)
         return

     try:
         json_data = json.loads(match.group(1))
         data = json_data.get("data")
         anchorId = data.get("anchorId")
         nick = data.get("nick")
         relation = data.get("relation")
         fansCount = relation.get("fansCount")
         followTopCount = relation.get("followTopCount")
         liveCount = relation.get("liveCount")
         info_list = []
         viewer_totle = 0
         # An anchor may have no replays at all.
         for replay in data.get("replays") or []:
             viewerCount = replay.get("viewerCount")
             info_list.append({
                 "liveId": replay.get("liveId"),
                 "liveTime": replay.get("liveTime"),
                 "roomTypeName": replay.get("roomTypeName"),
                 "title": replay.get("title"),
                 "viewerCount": viewerCount,
             })
             # A replay without a count contributes nothing to the total
             # instead of failing the whole record.
             viewer_totle += int(viewerCount or 0)
         live_info = json.dumps(info_list)
     except (ValueError, TypeError, AttributeError) as e:
         print(e)
         yield self.try_again(response, seller_id=seller_id)
         return

     # Yield outside the try so errors raised by the consumer are not
     # mistaken for parse failures.
     item = GmWorkItem()
     item["anchor_id"] = anchorId
     item["nick"] = nick
     item["fans_count"] = fansCount
     item["follow_count"] = followTopCount
     item["live_count"] = liveCount
     item["viewer_totle"] = viewer_totle
     item["live_info"] = live_info
     yield item
Пример #8
0
 def sort_all(self,response):
     """Extract product records from the page's embedded ``StoreState_base`` JSON.

     Emits one item per product found under ``items.itemsGroups[*].items``.
     A page without the marker, without the JSON assignment, or with an
     unparsable payload goes through ``self.try_again``.  Items are
     collected first and only emitted once the whole payload parsed, so a
     retry never follows a partial emission.
     """
     url = response.url
     match = None
     if re.search("(StoreState_base)",response.text):
         match = re.search("StoreState_base'] = ({.*?});</script>",response.text)
     if not match:
         # Covers both "marker missing" and "marker present but no JSON".
         yield self.try_again(response, url=url)
         return

     records = []
     try:
         data = json.loads(match.group(1))
         items = data.get("items")
         for group in items.get("itemsGroups") or []:
             for good in group.get("items") or []:
                 # Nested objects may be null; fall back to empty dicts.
                 location = good.get("location") or {}
                 title = good.get("title") or {}
                 normal = (good.get("price") or {}).get("normal") or {}
                 seller = good.get("seller") or {}
                 item = GmWorkItem()
                 item["id"] = good.get("id")
                 item["goods_url"] = good.get("url")
                 item["city"] = location.get("city")
                 item["good_name"] = title.get("text")
                 item["status"] = good.get("type")
                 item["price"] = normal.get("amount")
                 item["sales_num"] = good.get("bidInfo")
                 item["shop_id"] = seller.get("id")
                 item["shop_super"] = seller.get("superSeller")
                 item["shop_name"] = seller.get("login")
                 item["sort_id"] = good.get("categoryPath")
                 records.append(item)
     except (ValueError, TypeError, AttributeError):
         yield self.try_again(response, url=url)
         return

     # No yield sits inside a try block, so closing the generator
     # (GeneratorExit) is never intercepted.
     yield from records
Пример #9
0
 def get_sellerid(self, response):
     """Emit the shop/seller id pair carried in the request meta.

     On a 200 response an item with both ids is yielded (the pair is also
     printed when the page text lacks ``allProducts``).  On any other
     status a bare ``None`` is yielded and the pair is printed with a
     ``302:`` prefix.
     """
     shop_id = response.meta.get("shop_id")
     seller_id = response.meta.get("seller_id")

     if response.status != 200:
         yield None
         print("302:",shop_id, seller_id)
         return

     if "allProducts" not in response.text:
         print(shop_id,seller_id)
     item = GmWorkItem()
     item["shop_id"] = shop_id
     item["seller_id"] = seller_id
     yield item
Пример #10
0
 def try_again(self,rsp,key):
     """Return a retry of the failed request, or an error item once retries run out.

     Args:
         rsp: The response whose request should be retried; its ``meta``
             carries the ``try_num`` counter.
         key: Identifier stored on the error item.

     Returns:
         The original request (marked ``dont_filter`` and with an
         incremented ``try_num``) while fewer than ``max_num`` attempts
         were made; otherwise a ``GmWorkItem`` with ``error_id`` 1.
     """
     max_num = 5
     try_num = rsp.meta.get("try_num",0)
     # Retry while below the limit.  The comparison used to be inverted
     # (">"), which sent every first failure straight to the error item.
     if try_num < max_num:
         request = rsp.request
         request.dont_filter = True
         request.meta["try_num"] = try_num + 1
         return request
     item_e = GmWorkItem()
     item_e["error_id"] = 1
     item_e["key"] = key
     return item_e
Пример #11
0
    def parse(self, response):
        """Extract the links inside one allegro.pl page box.

        Emits the raw page source, then one item (name + absolute URL) per
        link inside the box with the fixed ``data-box-id``.  Pages without
        the box marker go through ``self.try_again``.
        """
        youxiao = re.search("(gS4GqiXvRSi8oJgNBVklGA)", response.text)
        url = response.url
        if not youxiao:
            yield self.try_again(response, url=url)
            return

        item_s = GmWorkItem()
        item_s["url"] = url
        item_s["source_code"] = response.text
        yield item_s

        shop_list = response.xpath(
            "//div[@data-box-id='gS4GqiXvRSi8oJgNBVklGA==']/div/ul//a")
        if not shop_list:
            print("shop_list有url没有选取")
        for link in shop_list:
            href = link.xpath("./@href").get()
            if not href:
                # An anchor without href would make the concatenation raise
                # and drop every remaining link on the page.
                continue
            item = GmWorkItem()
            item["name"] = link.xpath("./text()").get()
            item["url"] = "https://allegro.pl" + href
            yield item
Пример #12
0
 def sort_all(self, response):
     """Follow every category link found in the page's container headers.

     For each header with a link, emits an item recording the absolute
     URL and a request for it (no explicit callback, so the spider's
     default callback handles it).  Headers without a link are reported
     on stdout.  Non-200 responses produce nothing.
     """
     if response.status != 200:
         return
     headers = self.get_headers(1)
     for header in response.css(".container-header._1s2v1._n2pii._sdhee"):
         sort_url = header.xpath("./small/a/@href").get()
         if not sort_url:
             print("sort_all有url没有选取")
             continue
         sort_url = "https://allegro.pl" + sort_url
         item = GmWorkItem()
         item["url"] = sort_url
         # Emit the record; it used to be built and then thrown away.
         yield item
         yield scrapy.Request(url=sort_url,
                              method="GET",
                              headers=headers,
                              dont_filter=True)
Пример #13
0
 def try_again(self,rsp,**kwargs):
     """Return a retry of the failed request, or an error item carrying ``kwargs``.

     The retry limit is -1 and the ``try_num`` counter defaults to 0, so
     for any non-negative counter the retry branch is not taken and an
     error item (``error_id`` 1 plus every keyword argument as a field)
     is returned.
     """
     retry_limit = -1
     attempts = rsp.meta.get("try_num",0)

     if attempts >= retry_limit:
         item_e = GmWorkItem()
         item_e["error_id"] = 1
         for field, value in kwargs.items():
             item_e[field] = value
         return item_e

     retry = rsp.request
     retry.dont_filter = True
     retry.meta["try_num"] = attempts + 1
     return retry
Пример #14
0
    def parse_shop(self, response):
        """Turn a shop-info JSON response into one shop item.

        A response containing ``"error_msg":null`` is parsed; when its
        ``data`` object is non-empty one item is emitted, otherwise nothing
        is.  A parse error or a response without the marker is printed and
        handed to ``self.try_again``.
        """
        url = response.url
        shop_id = response.meta.get("shop_id")

        if not re.search('("error_msg":null)', response.text):
            print("无效:{}".format(url))
            yield self.try_again(response,
                                 shop_id=shop_id,
                                 pipeline_level="店铺信息")
            return

        try:
            data = json.loads(response.text).get("data")
            if data:
                item = GmWorkItem()
                item["shop_id"] = shop_id
                # Fields copied verbatim from the payload; the sequence
                # mirrors the item's established assignment order.
                # follower_count = followers, rating_good = positive
                # reviews, rating_bad = negative reviews,
                # cancellation_rate = return rate.
                for field in ("name", "description", "country", "place",
                              "follower_count", "rating_good", "rating_bad",
                              "cancellation_rate"):
                    item[field] = data.get(field)
                item["url"] = url
                for field in ("item_count", "rating_star", "shop_location"):
                    item[field] = data.get(field)
                item["pipeline_level"] = "店铺信息"
                yield item
        except Exception as e:
            print(e)
            yield self.try_again(response,
                                 shop_id=shop_id,
                                 pipeline_level="店铺信息")
Пример #15
0
 def try_again(self, rsp, shop_id, seller_id, page_num):
     """Return a retry of the failed request, or an error item after 5 attempts.

     While ``meta["try_num"]`` is below 5 the original request is
     returned with ``dont_filter`` set and the counter incremented.
     Afterwards a ``GmWorkItem`` with ``error_id`` 1 and the given
     identifiers is returned instead.
     """
     retry_limit = 5
     attempts = rsp.meta.get("try_num", 0)

     if attempts >= retry_limit:
         item_e = GmWorkItem()
         item_e["error_id"] = 1
         item_e["shop_id"] = shop_id
         item_e["seller_id"] = seller_id
         item_e["page_num"] = page_num
         return item_e

     retry = rsp.request
     retry.dont_filter = True
     retry.meta["try_num"] = attempts + 1
     return retry
Пример #16
0
    def parse(self, response):
        """Emit the company title found in the ``.b-title`` element.

        Pages whose text lacks the word ``Information`` are reported on
        stdout and handed to ``self.try_again``.
        """
        key = response.meta.get("key")

        if re.search("(Information)",response.text) is None:
            print("错误")
            yield self.try_again(response,key)
            return

        item = GmWorkItem()
        item["key"] = key
        item["url"] = response.url
        item["company_name"] = response.css(".b-title").xpath("./text()").get()
        yield item
Пример #17
0
 def sale_money(self, response):
     """Pull company sales figures out of an escaped-JSON response body.

     Each field is the first capture of its pattern in the body, or an
     empty string when the pattern is absent.  Commas and plus signs are
     stripped from the order amount.  Bodies without ``"success":true``
     go through ``self.try_again``.
     """
     key = response.meta.get("key")
     body = response.text

     if not re.search('"success":true', body):
         yield self.try_again(response, key)
         return

     def first_capture(pattern):
         # First capture group of pattern in the body, "" when not found.
         found = re.search(pattern, body)
         return found.group(1) if found else ""

     order_amount = first_capture(r'\\"ordAmt\\":\\"(.*?)\\"')
     order_amount = order_amount.replace(",","").replace("+","")

     item = GmWorkItem()
     item["key"] = key
     item["company_name"] = first_capture(r'\\"companyName\\":\\"(.*?)\\"')
     item["sales_money"] = order_amount
     item["sales_num"] = first_capture(r'\\"ordCnt6m\\":(\d*)')
     item["company_type"] = first_capture(r'\\"value\\":\\"(.*?)\\"')
     item["keep_time"] = first_capture(r'\\"companyJoinYears\\":\\"(.*?)\\"')
     item["pipeline_level"] = "销量"
     yield item
Пример #18
0
 def parse(self, response):
     """Extract company, legal person, address and matched id from a result page.

     Pages without the ``keyMoveItem`` marker are reported on stdout and
     handed to ``self.try_again``.
     """
     record_id = response.meta.get("id")

     if not re.search('(keyMoveItem)', response.text):
         print("{}错误了".format(record_id))
         yield self.try_again(response, id=record_id)
         return

     def first_text(css_rule, path="./text()"):
         # First node matched by path under the CSS selection, or None.
         return response.css(css_rule).xpath(path).get()

     item = GmWorkItem()
     item["id"] = record_id
     item["company"] = first_text(".zx-list-item-url")
     item["legal_person"] = first_text(".legal-txt")
     item["area"] = first_text(
         ".zx-ent-props", "./span/span[contains(text(),'地址')]/../text()")
     item["id_s"] = first_text(".zx-ent-hit-reason-text", "./em/text()")
     yield item
Пример #19
0
    def detail_data(self, response):
        """Combine profile data from the request meta with counters scraped from the page.

        The counters (gifts sent, gifts received, likes, fans) default to
        ``"0"`` and are filled from the ``.handle`` list by matching each
        entry's label.  Pages without the ``handle`` marker go through
        ``self.try_again``.
        """
        uid = response.meta.get("uid")
        nickname = response.meta.get("nickname")
        signature = response.meta.get("signature")
        labels = response.meta.get("labels")

        if not re.search('handle', response.text):
            yield self.try_again(response, url=response.url)
            return

        gift_num = "0"
        getgift_num = "0"
        praise_num = "0"
        fans_num = "0"
        for entry in response.css(".handle").xpath("./div/ul/li"):
            name = entry.xpath("./p/text()").get()
            value = entry.xpath("./h4/text()").get()
            # An entry missing its label or its value keeps the default
            # instead of raising on None.
            if not name or value is None:
                continue
            value = value.strip()
            if "送礼" in name:
                gift_num = value
            elif "收礼" in name:
                getgift_num = value
            elif "赞" in name:
                praise_num = value
            elif "粉丝" in name:
                fans_num = value

        item = GmWorkItem()
        item["up_id"] = uid
        item["nick"] = nickname
        item["signature"] = signature
        item["labels"] = str(labels)
        item["gift_num"] = gift_num
        item["getgift_num"] = getgift_num
        item["ol"] = praise_num
        item["fans"] = fans_num
        yield item
Пример #20
0
 def baidu_second(self,response):
     """Read province/city/district from a JSON response's ``addressComponent``.

     A body containing ``"status":0`` is parsed and one item is emitted
     with the company and address taken from the request meta.  Any other
     body is reported on stdout and handed to ``self.try_again``.
     """
     company = response.meta.get("company")
     address = response.meta.get("address")

     if not re.search('("status":0)',response.text):
         print("百度第一步{}错误了".format(company))
         yield self.try_again(response, company=company,address=address)
         return

     component = json.loads(response.text).get("result").get("addressComponent")
     region = [component.get(part) for part in ("province", "city", "district")]

     item = GmWorkItem()
     item["company"] = company
     item["address"] = address
     item["province"], item["city"], item["district"] = region
     yield item
Пример #21
0
 def parse(self, response):
     """Read province/city/district from the first entry of ``geocodes``.

     A body containing ``"status":"1"`` is parsed; when ``geocodes`` is
     non-empty one item is emitted, otherwise nothing is.  Any other body
     is reported on stdout and handed to ``self.try_again``.
     """
     company = response.meta.get("company")
     address = response.meta.get("address")

     if not re.search('("status":"1")',response.text):
         print("{}错误了".format(company))
         yield self.try_again(response, company=company,address=address)
         return

     geocodes = json.loads(response.text).get("geocodes")
     if not geocodes:
         return

     best = geocodes[0]
     region = [best.get(part) for part in ("province", "city", "district")]
     item = GmWorkItem()
     item["company"] = company
     item["address"] = address
     item["province"], item["city"], item["district"] = region
     yield item
Пример #22
0
    def parse(self, response):
        """Scrape a company contact page and schedule the sales-data request.

        Emits one item with contact, location and transaction fields read
        from whichever of the two page layouts is present.  On a 200
        response, when all four tokens (bizId, host_token, siteId, pageId)
        can be found in the page text, a form request to the ``render.htm``
        endpoint is scheduled with ``self.sale_money`` as callback.  Pages
        without any of the validity markers go through ``self.try_again``.
        """
        key = response.meta.get("key")
        url = response.url
        if not re.search("(HTTP 404|Information|302 Found)",response.text):
            yield self.try_again(response,key)
            return
        text = response.text

        def cell_text(node, path):
            # Full string value of the first node matched by path.
            return node.xpath(path).xpath("string(.)").get()

        def absorb(target, label, value):
            # Store value under every keyword of target contained in label.
            if not label:
                return
            for keyword in target:
                if keyword in label:
                    target[keyword] = value

        # Contact table: the fallback layout keeps its value in the second cell.
        contact = {"Address": "", "Company Name": ""}
        contact_rows = response.css(".contact-table").xpath("./tr")
        value_path = "./td"
        if not contact_rows:
            contact_rows = response.css(".company-info-data.table").xpath("./tr")
            value_path = "./td[2]"
        for row in contact_rows:
            absorb(contact, cell_text(row, "./th"), cell_text(row, value_path))

        # Location details: table rows, or dt/dd pairs in the fallback layout.
        location = {"Country": "", "Province": "", "City": "", "Zip": "", "Address": ""}
        info_rows = response.css(".info-table").xpath("./tr")
        if info_rows:
            for row in info_rows:
                absorb(location, cell_text(row, "./th"), cell_text(row, "./td"))
        else:
            info_list = response.css(".public-info").xpath("./dl")
            for position in range(1, len(info_list.xpath("./dt")) + 1):
                absorb(location,
                       cell_text(info_list, "./dt[{}]".format(position)),
                       cell_text(info_list, "./dd[{}]".format(position)))

        contact_people = response.css(".contact-name").xpath("./text()").get()
        if not contact_people:
            contact_people = response.css(".name").xpath("./text()").get()
        sales_money = response.css(".transaction-amount-value").xpath("./text()").get()
        if sales_money:
            sales_money = sales_money.replace(",", "").replace("+", "")

        item = GmWorkItem()
        item["key"] = key
        item["url"] = url
        item["company_name"] = contact["Company Name"]
        item["address_detail"] = contact["Address"]
        item["country"] = location["Country"]
        item["province"] = location["Province"]
        item["city"] = location["City"]
        item["address"] = location["Address"]
        item["zip"] = location["Zip"]
        item["contact_people"] = contact_people
        item["sales_money"] = sales_money
        item["sales_num"] = response.css(".transaction-number-value").xpath("./text()").get()
        item["company_type"] = response.css(".business-type").xpath("./text()").get()
        item["keep_time"] = response.css(".join-year").xpath("./span/text()").get()
        yield item

        if response.status != 200:
            return

        def token(pattern):
            # First capture group of pattern in the page text, "" when absent.
            found = re.search(pattern, text)
            return found.group(1) if found else ""

        bizId = token("bizId%22%3A(.*?)%2C%22")
        host_token = token("host_token:'(.*?)'")
        siteId = token("siteId%22%3A(.*?)%2C%22")
        pageId = token("pageId%22%3A(.*?)%2C%22")
        # The follow-up request is only made with a complete token set.
        if bizId and host_token and siteId and pageId:
            data = {"bizId": bizId, "language": "en_US", "envMode": "product",
                    "hostToken": host_token, "siteId": siteId, "pageId": pageId,
                    "renderType": "component", "componentKeys": "companyCard"}
            yield scrapy.FormRequest(
                url="https://{}.alibaba.com/event/app/alisite/render.htm".format(key),
                callback=self.sale_money,
                formdata=data,
                meta={"key": key})
Пример #23
0
    def data_parse(self, response):
        """Parse one page of the Ctrip hotel-search JSON and yield hotel items.

        Reads ``pinyin``, ``name``, ``city_id`` and ``page`` from
        ``response.meta``.  When the body does not contain ``"Success"`` the
        response is handed to ``self.try_again``.  On page ``"1"`` a POST
        request is scheduled for every remaining result page (20 hotels per
        page), then one ``GmWorkItem`` is yielded per hotel in the payload.
        """
        jiaoyan = "Success"
        meta = response.meta
        pinyin = meta.get("pinyin")
        name = meta.get("name")
        city_id = meta.get("city_id")
        page = meta.get("page")

        if jiaoyan not in response.text:
            yield self.try_again(response,
                                 pinyin=pinyin,
                                 name=name,
                                 id=city_id,
                                 page=page)
            return

        payload = json.loads(response.text).get("Response")
        # A missing/null title must not crash the regex; it simply means
        # "hotel count unknown", which disables the pagination below.
        hotel_num = re.sub(r"\D", "", payload.get("resultTitle") or "")

        # Schedule the remaining pages (only the first page does this).
        if page == "1" and hotel_num:
            total_pages = (int(hotel_num) + 19) // 20  # ceil(hotel_num / 20)

            headers = self.get_headers(2)
            home_url = "https://hotels.ctrip.com/hotels/listPage?cityename={}&city={}".format(
                pinyin, city_id)
            url = "https://m.ctrip.com/restapi/soa2/16709/HotelSearch"
            headers["Referer"] = home_url
            # NOTE(review): only the page number, city name, city id and hotel
            # url placeholders are substituted below; the check-in / check-out
            # placeholders (入住时间 / 离店时间) are sent literally. Confirm
            # whether real dates should be filled in.
            body_template = '''{"meta":{"fgt":"","hotelId":"","priceToleranceData":"","priceToleranceDataValidationCode":"","mpRoom":[],"hotelUniqueKey":"","shoppingid":""},"seqid":"","deduplication":[],"filterCondition":{"star":[],"rate":"","rateCount":[],"priceRange":{"lowPrice":0,"highPrice":-1},"priceType":"","breakfast":[],"payType":[],"bedType":[],"bookPolicy":[],"bookable":[],"discount":[],"zone":[],"landmark":[],"metro":[],"airportTrainstation":[],"location":[],"cityId":[],"amenty":[],"promotion":[],"category":[],"feature":[],"brand":[],"popularFilters":[]},"searchCondition":{"sortType":"1","adult":1,"child":0,"age":"","pageNo":页数,"optionType":"","optionId":"","lat":0,"destination":"","keyword":"","cityName":"城市名称","lng":0,"cityId":城市id,"checkIn":"入住时间","checkOut":"离店时间","roomNum":1,"mapType":"gd","travelPurpose":0,"countryId":1,"url":"酒店url","pageSize":20,"timeOffset":28800,"radius":0,"directSearch":0},"queryTag":"NORMAL","genk":true,"genKeyParam":"a=0,b=入住时间,c=离店时间,d=zh-cn,e=2","webpSupport":true,"platform":"online","pageID":"102002","head":{"Version":"","userRegion":"CN","Locale":"zh-CN","LocaleController":"zh-CN","TimeZone":"8","Currency":"CNY","PageId":"102002","webpSupport":true,"userIP":"","P":"","ticket":"","clientID":"","Union":{"AllianceID":"","SID":"","Ouid":""},"HotelExtension":{"group":"CTRIP","hasAidInUrl":false,"Qid":"","hotelUuidKey":"","hotelUuid":""}}}'''
            for page_no in range(2, total_pages + 1):
                data = body_template.replace("页数", str(page_no))
                data = data.replace("城市名称", name)
                data = data.replace("城市id", city_id)
                data = data.replace("酒店url", home_url)
                yield scrapy.Request(url=url,
                                     callback=self.data_parse,
                                     method="POST",
                                     body=data,
                                     headers=headers,
                                     dont_filter=True,
                                     meta={
                                         "pinyin": pinyin,
                                         "name": name,
                                         "city_id": city_id,
                                         "page": str(page_no)
                                     })

        # Null-safe walk over the hotel list: a missing list or a hotel with a
        # null sub-object yields empty fields instead of aborting the page.
        hotel_list = payload.get("hotelList") or {}
        for hotel in hotel_list.get("list") or []:
            base = hotel.get("base") or {}
            comment = hotel.get("comment") or {}
            money = hotel.get("money") or {}
            position = hotel.get("position") or {}
            score = hotel.get("score") or {}

            item = GmWorkItem()
            item["hotel_num"] = hotel_num
            item["hotel_id"] = base.get("hotelId")
            item["hotel_name"] = base.get("hotelName")
            item["hotel_enname"] = base.get("hotelEnName")
            item["tag"] = str(base.get("tags", ""))
            item["price"] = money.get("price")
            item["city"] = position.get("cityName")
            item["area"] = position.get("area")
            item["address"] = position.get("address")
            # Keep only the digits of the comment text.
            item["comment_num"] = re.sub(r"\D", "",
                                         comment.get("content") or "")
            item["comment"] = score.get("number")
            yield item
Пример #24
0
    def parse(self, response):
        """Parse one page of the Shopee search API and yield goods items.

        ``response.meta`` supplies ``match_id`` and ``page``.  A body without
        ``"error":null`` is treated as invalid and handed to
        ``self.try_again``.  On page ``1`` the remaining result pages (50 items
        each, capped at 5000 results) are scheduled.  For every listed item a
        ``GmWorkItem`` is yielded, followed by a request for its shop page
        handled by ``self.parse_shop``.  Any exception while processing the
        payload also triggers ``self.try_again``.
        """
        youxiao = re.search('("error":null)', response.text)
        meta = response.meta
        match_id = meta.get("match_id")
        page = meta.get("page")
        url = response.request.url
        headers = self.get_headers(1)

        if not youxiao:
            print("无效:{}".format(url))
            yield self.try_again(response, match_id=match_id, page=page)
            return

        try:
            data = json.loads(response.text)
            if page == 1:
                total_count = min(data.get("total_count"), 5000)
                # Offsets 50, 100, ... below total_count; offset 0 is this page.
                for offset in range(50, total_count, 50):
                    meta_r = {"match_id": match_id, "page": offset // 50 + 1}
                    url_r = 'https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id={}&newest={}&order=desc&page_type=search&version=2'.format(
                        match_id, offset)
                    yield scrapy.Request(url=url_r,
                                         headers=headers,
                                         meta=meta_r)

            items = data['items']
            # The original test `items != None or items != []` was always
            # true, so a null list crashed into the retry handler and the
            # "empty" branch below was unreachable.
            if items:
                for entry in items:
                    shop_id = entry.get("shopid")
                    price = entry.get("price")  # price
                    if price:
                        price = price / 100000
                    item_rating = entry.get("item_rating")
                    rating_star = ""
                    if item_rating:
                        rating_star = item_rating.get('rating_star')  # rating

                    item = GmWorkItem()
                    item["shop_id"] = shop_id
                    item["goods_id"] = entry.get("itemid")
                    item["name"] = entry.get("name")
                    item["price"] = price
                    item["currency"] = entry.get("currency")  # currency
                    item["totle_num"] = entry.get("historical_sold")  # historical sales
                    item["sales_num"] = entry.get("sold")
                    item["stock"] = entry.get("stock")  # stock
                    item["rating_star"] = rating_star
                    item["item_status"] = entry.get("item_status")  # item status
                    item["show_free_shipping"] = entry.get("show_free_shipping")  # free shipping
                    item["brand"] = entry.get("brand")  # brand
                    item["url"] = url
                    item["location"] = entry.get("shop_location")
                    item["view_count"] = entry.get("view_count")
                    item["pipeline_level"] = "list"
                    yield item

                    # Follow up with the shop page of this item.
                    yield scrapy.Request(url=self.shop_url.format(shop_id),
                                         headers=headers,
                                         callback=self.parse_shop,
                                         meta={'shop_id': shop_id})
            else:
                print("为空:{}".format(url))
        except Exception as e:
            # Any malformed payload is retried rather than dropped.
            print(e)
            yield self.try_again(response, match_id=match_id, page=page)
Пример #25
0
    def get_detail(self, response):
        """Parse a mobile AliExpress store-search JSON page and yield goods.

        ``response.meta`` supplies ``seller_id``, ``shop_id`` and ``page_id``.
        A body that does not start with ``{"`` is handed to
        ``self.try_again``.  Otherwise the raw JSON is yielded as a
        ``source_code`` item, follow-up requests are scheduled for every
        20-item offset below ``aem_count``, and one ``GmWorkItem`` is yielded
        per product in ``data.items``.
        """
        meta = response.meta
        json_str = response.text
        seller_id = meta.get("seller_id")
        shop_id = meta.get("shop_id")
        page_id = meta.get("page_id")

        if not json_str.startswith('{"'):
            yield self.try_again(response)
            return

        item_s = GmWorkItem()
        item_s["source_code"] = json_str
        yield item_s

        data = json.loads(json_str).get("data")
        # A null/missing item list means "no goods on this page".
        items = data.get("items") or []
        page = data.get("trace").get("page")
        aem_count = int(page.get("aem_count") or 0)

        if aem_count:
            self.goods_num += aem_count
            if self.goods_num % 100000 == 1:
                print(self.goods_num)

            # Loop-invariant pieces are built once.
            url_template = "https://m.aliexpress.com/api/search/products/items?pageId={}&searchType=storeSearch&sellerAdminSeq={}&storeId={}&infiniteScroll=true&start={}&shipToCountry=US&__amp_source_origin=https%3A%2F%2Fm.aliexpress.com"
            referer = "https://m.aliexpress.com/storesearch/list/.html?sortType=TC3_D&searchType=storeSearch&trace=store2mobilestoreNew&storeId={}".format(
                shop_id)
            # NOTE(review): hard-coded session cookie; confirm it is still
            # valid and acceptable to keep in source.
            cookies = "aefeMsite=amp--wRru0loiCNZjcQEqYc1Ew; ali_apache_id=11.180.122.26.1575437527682.392996.5; isg=BDEx-5kOyCf7m2SmkQaxvTBcQL0LtqIM-G1_rBNGL_giOlOMW256Y8wcWIj58j3I"

            for start in range(20, aem_count, 20):
                headers = self.get_headers()
                headers["Cookie"] = cookies
                headers["Referer"] = referer
                yield scrapy.Request(url=url_template.format(
                    page_id, seller_id, shop_id, start),
                                     callback=self.get_detail,
                                     method="GET",
                                     headers=headers,
                                     meta={
                                         "page_id": page_id,
                                         "seller_id": seller_id,
                                         "shop_id": shop_id
                                     })

        for good in items:
            price_info = good.get("price").get("price")

            item = GmWorkItem()
            item["shop_id"] = shop_id
            item["seller_id"] = seller_id
            item["goods_url"] = good.get("action")
            item["average_score"] = good.get("averageStarStr")
            item["img_url"] = good.get("imgUrl")
            item["currency"] = price_info.get("currency")
            item["price"] = price_info.get("value")
            item["goods_id"] = good.get("productId")
            item["subject"] = good.get("subject")
            item["aem_count"] = aem_count
            item["pipeline_level"] = "smt商品列表"
            yield item
Пример #26
0
 def parse(self, response):
     """Parse a product page and yield its source plus one detail item.

     A page containing neither ``product-title`` nor ``no longer available``
     is handed to ``self.try_again``.  Otherwise the raw HTML is yielded as a
     ``source_code`` item, followed by a ``GmWorkItem`` with title, shop,
     price, the recognised rows of the specification table and the
     description.
     """
     youxiao = re.search("(product-title|no longer available)",
                         response.text)
     url_key = response.request.url
     if not youxiao:
         yield self.try_again(response, url=url_key)
         return

     item_s = GmWorkItem()
     item_s["url"] = url_key
     item_s["source_code"] = response.text
     yield item_s

     good_name = response.css(".mb-8.js-product-title").xpath(
         "./text()").get()
     shop_title = response.css(".Product__Title.js-break-md-right")
     shop_name = shop_title.xpath("./p/a/text()").get()
     shop_url = shop_title.xpath("./p/a/@href").get()
     price = response.css(".price.js-meta-price").xpath("./text()").get()
     if price:
         # Keep digits and the decimal point only.
         price = re.sub(r"[^\d.]", "", price)

     # Table-header label (substring match) -> item field it fills.
     spec_fields = {
         "Brand": "brand",
         "Category": "category",
         "Size": "size",
         "Fruugo ID": "goods_id",
         "EAN": "ean",
         "Retailer VRN": "retailer_vrn",
         "Colour": "colour",
     }
     specs = dict.fromkeys(spec_fields.values(), "")
     rows = response.css(".table.table-striped.a11y-text-width").xpath(
         "./tr")
     for row in rows:
         label = row.xpath("./th/text()").get()
         if not label:
             # Rows without header text cannot be matched to a field.
             continue
         value = row.xpath("./td/text()").get()
         value = value.strip() if value else None
         if not value:
             # Some cells wrap their text in a link.
             value = row.xpath("./td/a/text()").get()
             value = value.strip() if value else None
         for needle, field in spec_fields.items():
             if needle in label:
                 specs[field] = value

     description = response.css(".js-product-description").xpath(
         "./text()").get()

     item = GmWorkItem()
     item["key"] = url_key
     item["good_name"] = good_name
     item["price"] = price
     item["shop_name"] = shop_name
     item["shop_url"] = shop_url
     for field, value in specs.items():
         item[field] = value
     item["description"] = description
     yield item
Пример #27
0
    def parse(self, response):
        """Parse one page of product feedback and yield one item per comment.

        ``response.meta`` supplies ``ownerMemberId``, ``productId`` and
        ``page`` (defaults to ``"1"``).  A page containing neither
        ``feedback-container`` nor ``feedbackServer`` is handed to
        ``self.try_again``.  After the comments are yielded, the next page is
        requested via ``self.request`` while the last dated comment falls in
        the current or the previous month and more comments remain (10 per
        page).
        """
        youxiao = re.search("feedback-container|feedbackServer", response.text)
        meta = response.meta
        seller_id = meta.get("ownerMemberId")
        goods_id = meta.get("productId")
        page = meta.get("page") or "1"

        if not youxiao:
            yield self.try_again(response,
                                 seller_id=seller_id,
                                 goods_id=goods_id,
                                 current_page=page)
            return

        comment_num = 0
        comment_num_str = response.css(".fb-star-selector").xpath(
            "./em/text()").get()
        if comment_num_str:
            match = re.search(r"(\d+)", comment_num_str)
            if match:
                comment_num = match.group(1)

        rate = [
            li.xpath("./span[3]/text()").get()
            for li in response.css(".rate-list").xpath("./li")
        ]
        comment_distribution = str(rate)

        mouth = ""  # month name of the last dated comment on this page
        for feedback in response.css(".feedback-list-wrap").xpath("./div"):
            user_name = feedback.css(".user-name").xpath("./a/text()").get()
            if not user_name:
                user_name = feedback.css(".user-name").xpath("./text()").get()
            buyer_feedback = feedback.css(".buyer-feedback")
            comment_time = buyer_feedback.xpath("./span[2]/text()").get()

            item = GmWorkItem()
            item["seller_id"] = seller_id
            item["goods_id"] = goods_id
            item["current_page"] = page
            item["comment_num"] = comment_num
            item["comment_distribution"] = comment_distribution
            item["user_name"] = user_name
            item["country"] = feedback.css(".user-country").xpath(
                "./b/text()").get()
            item["comment_score"] = feedback.css(".star-view").xpath(
                "./span/@style").get()
            item["comment"] = buyer_feedback.xpath("./span[1]/text()").get()
            item["time"] = comment_time
            yield item

            if comment_time:
                mouth_match = re.search(r"\d+ ([a-z]+) \d{4}",
                                        comment_time,
                                        flags=re.I)
                if mouth_match:
                    mouth = mouth_match.group(1)

        page_mouth = self.mouth_dict.get(mouth.upper())

        # Cross-year fix: in January the previous month is December, not 0.
        # Assumes months are numbered 1-12 in mouth_dict / current_mouth.
        current_mouth = self.current_mouth
        previous_mouth = 12 if current_mouth == 1 else current_mouth - 1

        if page_mouth in (current_mouth, previous_mouth) and int(
                page) * 10 < int(comment_num):
            next_page = int(page) + 1
            yield self.request(seller_id, goods_id, str(next_page),
                               str(next_page - 1))
Пример #28
0
    def get_detail(self, response):
        """Parse a shop's product-list JSON and page through the results.

        ``response.meta`` carries ``totle_num`` (total pages, computed from
        ``data.total`` at 20 per page when absent), ``page_num``, ``shop_id``
        and ``seller_id``.  One ``GmWorkItem`` is yielded per product, then one
        holding the parsed payload.  The next page is requested unless a
        product with zero orders was seen, the last page was reached, or the
        page held fewer than 20 products.  Any exception hands the response to
        ``self.try_again``.
        """
        request_meta = response.meta
        totle_num = request_meta.get("totle_num")
        page_num = request_meta.get("page_num")
        shop_id = request_meta.get("shop_id")
        seller_id = request_meta.get("seller_id")

        stop_paging = False
        try:
            payload = json.loads(response.text)
            data = payload.get("data")
            if not totle_num:
                totle = data.get("total")
                if totle % 20:
                    totle_num = int(totle / 20) + 1
                else:
                    totle_num = int(totle / 20)

            ret = data.get("ret")
            for product in ret:
                item = GmWorkItem()
                orders = product.get("orders")
                sale_price = product.get("salePrice")
                max_price = sale_price.get("maxPrice")
                min_price = sale_price.get("minPrice")

                item["shop_id"] = shop_id
                item["seller_id"] = seller_id
                item["totle_num"] = totle_num
                item["id"] = product.get("id")
                item["orders"] = orders
                item["max_price"] = max_price
                item["min_price"] = min_price
                item["goods_url"] = product.get("pcDetailUrl")
                item["average_score"] = product.get("averageStar")  # rating
                item["goods_name"] = product.get("subject")
                item["comment_num"] = product.get("feedbacks")  # feedback count
                item["media_id"] = product.get("mediaId")  # media id
                item["img_url"] = product.get("image350Url")  # image url
                item["tag"] = product.get("tagResult")  # tags
                yield item

                if orders == 0:
                    stop_paging = True

            item_s = GmWorkItem()
            item_s["shop_id"] = shop_id
            item_s["source_code"] = payload
            yield item_s

            if page_num >= totle_num or len(ret) < 20:
                stop_paging = True

            if not stop_paging:
                page_num += 1
                next_meta = {
                    "totle_num": totle_num,
                    "page_num": page_num,
                    "shop_id": shop_id,
                    "seller_id": seller_id
                }
                next_url = "https://{}.aliexpress.com/{}".format(
                    shop_id, page_num)
                yield scrapy.Request(url=next_url,
                                     callback=self.get_detail,
                                     method="GET",
                                     meta=next_meta,
                                     dont_filter=True)
        except Exception:
            yield self.try_again(response, shop_id, seller_id, page_num)
Пример #29
0
    def parse_goodinfo(self, response):
        """Extract the fields of one item page and yield a ``GmWorkItem``.

        The item id is taken from the URL (``itm.../<digits>``); name, prices,
        location, brand, seller, sales count and up to six breadcrumb
        categories are read via XPath.  Every missing field is reported as a
        single space ``' '``.  The decoded page body is attached as ``html``.
        """

        def clean(text):
            # NOTE(review): as written this replace swaps a comma for the same
            # comma, i.e. it is a no-op; presumably one side was meant to be a
            # different character - confirm. Kept unchanged.
            return text.strip().replace(',', ',')

        id_match = re.search(r'itm.+/(\d+)', response.url)
        good_id = id_match.group(1) if id_match is not None else ' '

        html = response.body.decode()

        good_name = response.xpath('//h1[@id="itemTitle"]/text()').get()
        good_name = clean(good_name) if good_name else ' '

        price_dollar = response.xpath('//span[@id="prcIsum"]/@content').get()
        if price_dollar:
            price_dollar = price_dollar.strip().replace(',', '')
        else:
            price_dollar = ' '

        # The converted price is the second whitespace-separated token; fall
        # back to ' ' instead of raising when the text has fewer tokens.
        price_RMB = response.xpath(
            '//div[@id="prcIsumConv"]/span/text()').get()
        rmb_parts = price_RMB.split() if price_RMB is not None else []
        price_RMB = rmb_parts[1].replace(',', '') if len(rmb_parts) > 1 else ' '

        project_location = response.xpath(
            '//span[@itemprop="availableAtOrFrom"]/text()').get()
        project_location = clean(project_location) if project_location else ' '

        # The last itemprop="name" span is used as the brand.
        names = response.xpath('//span[@itemprop="name"]/text()').getall()
        brand = clean(names[-1]) if names else ' '

        seller_name = response.xpath(
            '//span[@class="mbg-nw"]/font/font/text()|//span[@class="mbg-nw"]/text()').get()
        seller_name = clean(seller_name) if seller_name else ' '

        # The sales count is the first token of the link text; blank text
        # yields ' ' instead of an IndexError.
        sales_count = response.xpath(
            '//a[@class="vi-txt-underline"]/text()').get()
        sales_parts = sales_count.split() if sales_count is not None else []
        sales_count = sales_parts[0] if sales_parts else ' '

        # First six breadcrumb categories, padded with ' ' when fewer exist.
        cats = response.xpath('//li[@class="bc-w"]//span/text()').getall()
        cats = [clean(cat) for cat in cats[:6]]
        cats += [' '] * (6 - len(cats))
        cat_1, cat_2, cat_3, cat_4, cat_5, cat_6 = cats

        item = GmWorkItem(good_id=good_id, good_name=good_name, price_dollar=price_dollar, price_RMB=price_RMB,
                            project_location=project_location, brand=brand, seller_name=seller_name,
                            sales_count=sales_count, cat_1=cat_1, cat_2=cat_2, cat_3=cat_3, cat_4=cat_4, cat_5=cat_5,
                            cat_6=cat_6, html=html)

        yield item