def prerequest(self):
    """Fetch the product page and handle pages without an add-to-cart button.

    Requests ``self.url`` with ``self.headers`` and stores the response in
    ``self.res``.  When the parsed page has no ``add-to-cart-button`` element
    but does have an ``availability`` element, the first link inside that
    element is followed (stored in ``self.second_url``) and the price and
    seller name found there are stored in ``self.unnormal_price`` and
    ``self.unnormal_shop``.

    Returns:
        ``False`` after the availability link has been followed (in which
        case ``self.normal_situation`` is set to ``False``), ``True``
        otherwise.

    NOTE(review): the original indentation of this method was lost; the
    placement of ``self.normal_situation = False`` / ``return False``
    directly after the try/except is a reconstruction -- confirm.
    """
    queue = self.queue
    queue.put("prerequest")
    print("prerequest")
    GlobalTools.setbaseurl(self.baseurl)

    self.res = requests.get(self.url, headers=self.headers)
    html = GlobalTools.getResponseContent(self.res)

    if html.find(id="add-to-cart-button") is not None:
        return True

    # Look the element up once instead of re-querying the document.
    availability = html.find(id="availability")
    if availability is None:
        return True

    # NOTE(review): an availability element without an <a> child still
    # raises AttributeError here, exactly as before -- confirm whether that
    # case needs its own handling.
    url = self.baseurl + availability.find("a").get('href')
    self.second_url = url
    res = requests.get(url, headers=GlobalTools.getHeaders())
    html = GlobalTools.getResponseContent(res)
    try:
        price = html.find(class_="olpOfferPrice").text.strip()
        self.unnormal_price = price
        print(price)
        shop = html.find(class_="olpSellerName").text
        self.unnormal_shop = shop
        print(shop)
    except Exception:
        # Best effort: a missing price/seller element is reported but does
        # not abort.  ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt and SystemExit propagating.
        traceback.print_exc()
    self.normal_situation = False
    return False
def parse(self, sheet, currrow):
    """Run every extraction step on the fetched page and return the result.

    ``self.result`` is reset to ``[currrow]``, the product URL is reported on
    ``self.queue`` and stdout, ``self.res`` is parsed into ``self.html`` and
    the individual ``get*`` steps are invoked in a fixed order.

    Args:
        sheet: not used by this method.
        currrow: value used as the first element of ``self.result``.

    Returns:
        Whatever ``self.getresult()`` returns.
    """
    progress = self.queue
    self.result = [currrow]

    print("")
    progress.put(u"商品链接")
    progress.put(self.url)
    print("商品链接:")
    print(self.url)

    self.html = GlobalTools.getResponseContent(self.res)

    # Extraction steps, in the order they must run.  Each method is looked
    # up right before it is called.
    step_names = (
        "geturl",
        "getprice",
        "getshopname",
        "getbrand",
        "getfirstavailable",
        "getranking",
        "getqa",
        "getCategory",
        "getstars",
        "getreviewcount",
        "getgoodreviewvote",
    )
    for step_name in step_names:
        getattr(self, step_name)()

    # The US review count is gathered inside getgoodreviewvote, so it has to
    # be recomputed afterwards when flagged.
    if self.us_reviews_need_adjust:
        self.getusviewcount()

    self.getfba()
    print(self.resultmap)
    return self.getresult()
def get_email(self, url):
    """Request *url* as an XMLHttpRequest and print the status and content.

    The headers from ``GlobalTools.getHeaders()`` are extended with an
    ``X-Requested-With``, a ``Referer`` and a ``Cookie`` entry.  Nothing is
    returned; the status code and the parsed content are printed.

    Args:
        url: address to request.
    """
    print("******************")
    print("url:" + url)

    headers = GlobalTools.getHeaders()
    headers['X-Requested-With'] = 'XMLHttpRequest'
    headers['Referer'] = 'https://www.amazon.de/gp/profile/amzn1.account.AF3BW3DYKKEHMR4HSAFIQDM62QNQ/ref=cm_cr_getr_d_pdp?ie=UTF8'
    # SECURITY NOTE(review): a session cookie is hard-coded in source.  It
    # should be loaded from configuration/secret storage and the session
    # invalidated -- left unchanged here to preserve behaviour.
    headers['Cookie'] = 's_nr=1507517187408-New; s_vnum=1939517187408%26vn%3D1; s_dslv=1507517187409; x-acbde="6gxYYwpBpG20FBChzzu9sn?hypH9MpwKF0gVmk2LOxnYWw2uE@5B3Qh7Df?gkrXM"; at-acbde=Atza|IwEBIFPo-tRvBxygSgF8Ard63lJANpi78TG-8BUTC8ScSLLiUskUDIh0VMUwG_l8fsWqij5ArfksGmp6Ks52ZiYPS0bJeoDkACAtCZF6h3ePo0yqw9jdKVsq4edrTZPfLFYYYaRsbNyD2x09klSn7jKaU8Sn56Cr4VCIx_H8LObqLF2bX6Aq0EWW-O0PoBHgkdYI9iPhMo_2OHQjWuFAeinw0dU1M7X-SWBl2wB4FtzVXlQzarbwLjsHxXSaw2LwX3ENF6oCHOh73pPPnTX68JEedEkLu-sOSL2eZ5Whe7zJ2L76yyEzyjVXQpWbDdUqUP58MdLTNLfhCM5LkwWGmd7fuoLC1u7sZhBkJSA6oLQ0Q3kua5e8x0LfI3HfLZwC6qzrDJ6pheW0my98MFK4r9JaG85Z; sess-at-acbde="d7DXrZglD8+7+42k5qmlfFUxSpHJkUg8H1Dz17ZCU+U="; x-wl-uid=1WLJUGaYF93xUQuJRK3PCgsu0IJeaJoL7J/7XRaD4Men7E4FPUEro4vxW+rjyvLb9XCGGKFNM1yrtwZ9b9BK3yXkMKCav41q6XBiaxBqGmVWG1vMYfNxoP30XR5Otq5GKr5uenX7TA98=; session-token="1o+pNqOm6F7uZWrYdtDbU26LiB8ByJ40B64c+JFwPh3lkBt1MbUn+ha6qR3BaTgduMMVK1e1LjJ6pnoF+/r3c4PUBDfax7J+AGcgt2QiXkvMdVyLjyDowIQtWUbeHi6V4hfxIhgrYGcAyZ4x4keQvPaEHOW0v8t8akQV0nmi5sj1Jzu8pn162bmTw0XLP88olTMWGCWAeJlHGsXpCvyiS1VrFGHpgj2xSW3j5jdNi8DCjE4R7E+EqR+4BNFVQs+1KUR7bf9qBMWu3xT7DDe9KQ=="; session-id-time=2082754801l; session-id=261-5557160-1959728; ubid-acbde=258-5984155-0914160; csm-hit=0CP5W9ZYZNFE06XFCV0V+b-9KE07PDF4YD27JB8DFQQ|1509967781417'

    # The second positional parameter of requests.get is ``params``; the
    # headers must be passed by keyword, otherwise they are sent as
    # query-string parameters instead of HTTP headers.
    res = requests.get(url, headers=headers)
    print(res.status_code)
    html = GlobalTools.getResponseContent(res)
    print(html)
    print("******************")
def getusviewcount(self):
    """Fetch the amazon.com review page and record positive/critical counts.

    Requests the product-reviews page for ``self.asin``, collects the
    elements whose id matches ``viewpoint-`` and, when they can be parsed,
    stores the counts in ``self.resultmap['positivereviewcount']`` (first
    viewpoint) and ``self.resultmap['negtivereviewcount']`` (second
    viewpoint).  A count that cannot be parsed is left unset.  The matched
    viewpoint elements are printed at the end.
    """

    def _count_before(viewpoint, reftag, marker):
        """Return the integer between "all" and *marker*, or None."""
        try:
            text = viewpoint.find_all(attrs={"data-reftag": reftag})[0].text
            number = text.split(marker)[0].split("all")[1].strip()
            # int() rejects thousands separators such as "1,234"; strip them
            # so larger counts are not silently dropped.
            return int(number.replace(",", ""))
        except (IndexError, ValueError):
            # Best effort: element missing or text not in the expected form.
            return None

    url = ("https://www.amazon.com/product-reviews/" + self.asin
           + "/ref=acr_dpx_see_all?ie=UTF8&showViewpoints=1")
    res = requests.get(url, headers=GlobalTools.getHeaders())
    html = GlobalTools.getResponseContent(res)
    viewpoints = html.find_all(id=re.compile("viewpoint-"))

    # (resultmap key, data-reftag of the link, word that follows the number)
    targets = (
        ("positivereviewcount", "cm_cr_arp_d_viewpnt_lft", "positive"),
        ("negtivereviewcount", "cm_cr_arp_d_viewpnt_rgt", "critical"),
    )
    # zip() stops at the shorter sequence, so a missing second (or first)
    # viewpoint is simply skipped.
    for viewpoint, (key, reftag, marker) in zip(viewpoints, targets):
        count = _count_before(viewpoint, reftag, marker)
        if count is not None:
            self.resultmap[key] = count

    print(viewpoints)
def get_imgs_by_product_url(self, url):
    """Fetch *url* and look up the ``ul`` inside ``main-image-container``.

    Args:
        url: product page address to request.

    NOTE(review): the lookup result is discarded and the method returns
    ``None``; this looks unfinished -- confirm the intended return value.
    """
    response = requests.get(url)
    page = GlobalTools.getResponseContent(response)
    container = page.find(id="main-image-container")
    # Result intentionally not bound, matching existing behaviour.
    container.find("ul")