def get_rele_word(self, in_word, country):
    """Fetch Amazon search-completion suggestions for *in_word*.

    Picks the completion endpoint for the given marketplace, queries it,
    and returns the suggestion list (second element of the JSON payload).
    """
    # Marketplace code -> (endpoint host, URL prefix the query term is appended to).
    endpoints = {
        'us': ('completion.amazon.com',
               'https://completion.amazon.com/search/complete?method=completion&search-alias=aps&client=amazon-search-ui&mkt=1&fb=1&sc=1&q='),
        'ca': ('completion.amazon.com',
               'https://completion.amazon.com/search/complete?method=completion&mkt=7&l=en_CA&client=amazon-search-ui&search-alias=aps&qs=&cf=1&fb=1&sc=1&q='),
        'fr': ('completion.amazon.co.uk',
               'https://completion.amazon.co.uk/search/complete?method=completion&mkt=5&l=fr_FR&client=amazon-search-ui&search-alias=aps&qs=&cf=1&fb=1&sc=1&q='),
        'de': ('completion.amazon.co.uk',
               'https://completion.amazon.co.uk/search/complete?method=completion&mkt=4&l=de_DE&client=amazon-search-ui&search-alias=aps&qs=&cf=1&fb=1&sc=1&q='),
        'it': ('completion.amazon.co.uk',
               'https://completion.amazon.co.uk/search/complete?method=completion&mkt=35691&l=it_IT&client=amazon-search-ui&search-alias=aps&qs=&cf=1&fb=1&sc=1&q='),
        'jp': ('completion.amazon.co.jp',
               'https://completion.amazon.co.jp/search/complete?method=completion&l=ja_JP&client=amazon-search-ui&search-alias=aps&qs=&cf=1&fb=1&sc=1&mkt=6&q='),
        'es': ('completion.amazon.co.uk',
               'https://completion.amazon.co.uk/search/complete?method=completion&mkt=44551&l=es_ES&client=amazon-search-ui&search-alias=aps&qs=&cf=1&fb=1&sc=1&q='),
    }
    # Any unrecognized marketplace falls back to the UK endpoint (mkt=3).
    fallback = ('completion.amazon.co.uk',
                'https://completion.amazon.co.uk/search/complete?method=completion&search-alias=aps&client=amazon-search-ui&mkt=3&fb=1&sc=1&q=')
    host, url_prefix = endpoints.get(country, fallback)
    page = get_url(url_prefix + in_word, host=host)
    return json.loads(page.text)[1]
def get_reviews(self, country, asin, crontab):
    """Scrape the review count and star rating for *asin* and persist them.

    Loads page 1 of the product's reviews (sorted by most recent), parses
    the aggregate star rating and total review count, and stores both on
    the newest ``Inventory_time`` snapshot for the ASIN.  When *crontab*
    is set, also records the review delta against the previous snapshot;
    if the page fetch came back falsy (HTTP error response), the previous
    snapshot's values are carried forward instead.
    """
    reviews_page = get_url(
        'product-reviews/' + asin +
        '/ref=cm_cr_arp_d_viewopt_srt?sortBy=recent&pageNumber=1', country)
    # Build the selector once instead of re-parsing the page per XPath.
    sel = Selector(text=reviews_page.content)
    # The aggregate-rating markup differs per marketplace.
    if country in ['de', 'uk', 'fr', 'it', 'es']:
        star = sel.xpath(
            './/*[@id="cm_cr-product_info"]/div/div[1]/div[3]/span/text()'
        ).extract()[0].split(" ")[0]
    elif country == 'jp':
        # The numeric rating is the last 3 characters of the JP label text.
        star = sel.xpath(
            './/*[@id="cm_cr-product_info"]/div/div[1]/div[3]/span/text()'
        ).extract()[0][-3:]
    else:
        star = sel.xpath(
            './/*[@id="cm_cr-product_info"]/div/div[1]/div[3]/span/a/span/text()'
        ).extract()[0].split(" ")[0]
    reviews = sel.xpath(
        './/*[@id="cm_cr-product_info"]/div/div[1]/div[2]/div/div/div[2]/div/span/text()'
    ).extract()[0]
    reviews = int(re.sub(",", "", reviews))  # drop thousands separators

    inventory = Inventory.objects.filter(asin=asin)[0]
    snapshots = Inventory_time.objects.filter(asin=inventory)
    inventory_time = snapshots[0]
    if reviews_page:  # truthy response => page fetched OK; record fresh values
        inventory_time.reviews = reviews
        inventory_time.star = star
        if crontab:
            try:
                old_reviews = snapshots[1].reviews
            except IndexError:  # no earlier snapshot exists yet
                old_reviews = 0
            reviews_add = reviews - old_reviews
            if reviews_add >= 0:
                inventory_time.reviews_add = reviews_add
    if crontab and not reviews_page:
        # Fetch failed on a scheduled run: carry the previous snapshot forward.
        inventory_time.reviews = snapshots[1].reviews
        inventory_time.star = snapshots[1].star
    inventory_time.save()
def rank_in_web(self,asin,keyword,country='us'):
    """Search Amazon for *keyword* and record where *asin* ranks.

    Walks the search-result pages (up to 21) looking for the ASIN,
    saves a ``Rank`` row with the page/position found (flagging
    sponsored slots), and returns an info dict with ``rank``/``page``/
    ``sponsored``/``to`` keys.  Celery task: retries once more after
    40s on any unexpected error.
    """
    try:
        sponsored=False
        info = {'rank':None,'page':None,'sponsored':False,'to':None}
        keyword_rank=Keyword_rank.objects.filter(asin=asin,country=country,word=keyword)[0]
        page=get_url('s?field-keywords='+keyword,country=country)
        keyword_rank.rank_url=page.url
        keyword_rank.save()
        # No search results at all: record a zero rank and stop.
        if Selector(text=page.content).xpath('.//*[@id="noResultsTitle"]'):
            rank=Rank(keyword_rank=keyword_rank,page=0,number=0,sponsored=False,to=0)
            rank.save()
            info = {'rank':0,'page':0,'sponsored':False}
            return info
        #print(page.status_code, page.text[:200])
        # flag_1: the per-page item count could be parsed from "#s-result-count".
        # flag_2: results use the plain "s-result-item celwidget" layout
        #         (False => card/container layout).
        flag_1,flag_2 = True,True
        #//*[@id="s-result-count"] //*[@id="s-result-count"]
        if country=='jp':
            try:
                #len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]')) >= \
                #int(Selector(text=page.content).xpath('.//*[@id="s-result-count"]/text()').extract()[0][-4:-2]):
                # JP result-count text carries the per-page count at [-4:-2].
                item_amount=int(Selector(text=page.content).xpath('.//*[@id="s-result-count"]/text()').extract()[0][-4:-2])
            except:
                # Count text missing/unparseable: count the result tiles directly.
                flag_1 = False
                item_amount=len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]'))
        else:
            if len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]')):
                #if len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]')) >= \
                #int(Selector(text=page.content).xpath('.//*[@id="s-result-count"]/text()').extract()[0][2:4]):
                try:
                    # Non-JP result-count text carries the per-page count at [2:4].
                    item_amount=int(Selector(text=page.content).xpath('.//*[@id="s-result-count"]/text()').extract()[0][2:4])
                except:
                    flag_1 = False
                    item_amount=len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]'))
            else:
                flag_2 = False
                #if len(Selector(text=page.content).xpath('.//li[@class="s-result-item s-result-card-for-container a-declarative celwidget "]')) >= \
                #int(Selector(text=page.content).xpath('.//*[@id="s-result-count"]/text()').extract()[0][2:4]):
                try:
                    item_amount=int(Selector(text=page.content).xpath('.//*[@id="s-result-count"]/text()').extract()[0][2:4])
                except:
                    flag_1 = False
                    item_amount=len(Selector(text=page.content).xpath('.//li[@class="s-result-item s-result-card-for-container a-declarative celwidget "]'))
        print(item_amount)
        tree = fromstring(page.content)
        print(tree.findtext('.//title'))
        # Amazon served a captcha page instead of results.
        if tree.findtext('.//title')=='Robot Check' or tree.findtext('.//title')=='Amazon CAPTCHA':
            if len(keyword_rank.rank.all())>2:
                # Enough history: repeat the most recent rank so the series continues.
                rank=Rank(page=keyword_rank.rank.first().page,number=keyword_rank.rank.first().number,sponsored=keyword_rank.rank.first().sponsored,keyword_rank=keyword_rank,to=0)
                rank.save()
                info = {'rank':0,'page':0,'sponsored':True}
                return info
            else:
                rank=Rank(keyword_rank=keyword_rank,page=0,number=0,sponsored=False,to=0)
                rank.save()
                info = {'rank':0,'page':0,'sponsored':True}
                return info
        #.//li[@class="s-result-item s-result-card-for-container a-declarative celwidget "]
        if flag_1:
            # Multi-page scan: result ids are sequential across pages
            # ("result_<item_amount*page_num + j>"), so the parsed per-page
            # count is needed to address them.
            page_num=0
            while True:
                print(tree.findtext('.//title'),"page",page_num+1)
                if country=='jp':
                    fanwei=range(len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]')))
                elif flag_2:
                    fanwei=range(len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]')))
                else:
                    fanwei=range(len(Selector(text=page.content).xpath('.//li[@class="s-result-item s-result-card-for-container a-declarative celwidget "]')))
                for j in fanwei:
                    #try:
                    #print(str(item_amount*page_num+j))
                    if Selector(text=page.content).xpath(".//*[@id='result_"+str(item_amount*page_num+j)+"']/@data-asin"):
                        seller=(Selector(text=page.content).xpath(".//*[@id='result_"+str(item_amount*page_num+j)+"']/@data-asin").extract()[0])
                        print(seller)
                        if seller==asin:
                            try:
                                # The h5 node only exists on sponsored slots; its
                                # absence raises IndexError and falls into the
                                # organic-rank branch below.
                                if Selector(text=page.content).xpath(".//*[@id='result_"+str(item_amount*page_num+j)+"']/div/div/div/div[2]/h5/text()").extract()[0]:
                                    info = {'rank':j+1,'page':page_num+1,'sponsored':True,'to':page_num+1}
                                    print("搜索关键词%s:%s排名第%s页,第%s名(广告)" % (keyword,asin,page_num+1,j+1))
                                    rank=Rank(keyword_rank=keyword_rank,page=info['page'],number=info['rank'],sponsored=info['sponsored'],to=info['page'])
                                    rank.save()
                                    # Sponsored hit: keep scanning for an organic rank.
                                    sponsored=True
                            except:
                                # Organic (non-sponsored) hit: record and stop.
                                info = {'rank':j+1,'page':page_num+1,'sponsored':False,'to':page_num+1}
                                print("搜索关键词%s:%s排名第%s页,第%s名" % (keyword,asin,page_num+1,j+1))
                                rank=Rank(keyword_rank=keyword_rank,page=info['page'],number=info['rank'],sponsored=info['sponsored'],to=info['page'])
                                rank.save()
                                return info
                    else:
                        print("在页面找商品的逻辑错误")
                #except:
                #    print('Sponsored:搜索关键词%s,排名第%s页,第%s名' % (keyword,page_num+1,j+1))
                # Follow "next page" up to a hard cap of 21 pages.
                if Selector(text=page.content).xpath(".//*[@id='pagnNextLink']/@href") and page_num<20:
                    time.sleep(0.2+random.random()*0.5)  # polite crawl delay
                    page=get_url((Selector(text=page.content).xpath(".//*[@id='pagnNextLink']/@href")).extract()[0],country=country)
                    page_num += 1
                else:
                    if sponsored:
                        return "仅发现广告排名"
                    else:
                        # ASIN never found: record a zero rank for this scan.
                        rank=Rank(keyword_rank=keyword_rank,page=0,number=0,sponsored=False,to=page_num+1)
                        rank.save()
                        info = {'rank':0,'page':0,'sponsored':False,'to':page_num+1}
                        return info
                #if len(keyword_rank.rank.all())>2:
                #rank=Rank(page=keyword_rank.rank.first().page,number=keyword_rank.rank.first().number,sponsored=keyword_rank.rank.first().sponsored,keyword_rank=keyword_rank)
        else:
            # Per-page count unknown (flag_1 False): only the first page can be
            # scanned, since result ids on later pages cannot be computed.
            if country=='jp':
                fanwei=range(len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]')))
            elif flag_2:
                fanwei=range(len(Selector(text=page.content).xpath('.//li[@class="s-result-item celwidget "]')))
            else:
                fanwei=range(len(Selector(text=page.content).xpath('.//li[@class="s-result-item s-result-card-for-container a-declarative celwidget "]')))
            for j in fanwei:
                try:
                    if Selector(text=page.content).xpath(".//*[@id='result_"+str(j)+"']/@data-asin"):
                        seller=(Selector(text=page.content).xpath(".//*[@id='result_"+str(j)+"']/@data-asin").extract()[0])
                        print(seller)
                    else:
                        print("在页面找商品的逻辑错误")
                    # NOTE(review): if the branch above printed the error,
                    # `seller` here is the value left over from a previous
                    # iteration (NameError on the first) — confirm intended.
                    if seller==asin:
                        try:
                            if Selector(text=page.content).xpath(".//*[@id='result_"+str(j)+"']/div/div/div/div[2]/h5/text()").extract()[0]:
                                info = {'rank':j+1,'page':1,'sponsored':True}
                        except:
                            info = {'rank':j+1,'page':1,'sponsored':False}
                        print("搜索关键词%s:%s排名第1页,第%s名" % (keyword,asin,j+1))
                        rank=Rank(keyword_rank=keyword_rank,page=info['page'],number=info['rank'],sponsored=info['sponsored'],to=1)
                        rank.save()
                        return info
                except:
                    # Any parse failure: record a zero rank and stop.
                    rank=Rank(keyword_rank=keyword_rank,page=0,number=0,sponsored=False,to=0)
                    rank.save()
                    info = {'rank':0,'page':0,'sponsored':False}
                    return info
    except Exception as e:
        # Celery retry: try again once, 40 seconds from now.
        dt = datetime.datetime.now(pytz.utc) + datetime.timedelta(seconds=40)
        self.retry(eta=dt, exc=e, max_retries=2)
def review_collect(self,asin,country='us'):
    """Scrape every review page for *asin* and persist ``Review_detail`` rows.

    Fetches the product's review listing (newest first), records the
    aggregate star rating and review count on the ``Product``, then walks
    up to 200 pages collecting individual reviews.  Returns ``{'to': n}``
    where ``n`` is the last page index reached (0 on captcha/no-results).
    Celery task: retries once more after 40s on any unexpected error.
    """
    try:
        product = Product.objects.filter(asin=asin, country=country)[0]
        page = get_url('product-reviews/' + asin + '/ref=cm_cr_arp_d_viewopt_srt?sortBy=recent&pageNumber=1&reviewerType=all_reviews', country)
        tree = fromstring(page.content)
        print(tree.findtext('.//title'))
        # Amazon served a captcha page instead of content — give up for now.
        if tree.findtext('.//title') == 'Robot Check' or tree.findtext('.//title') == 'Amazon CAPTCHA':
            return {'to': 0}
        # Product has no reviews at all.
        if Selector(text=page.content).xpath('.//*[@id="noResultsTitle"]'):
            product.review_qa_complete_time = timezone.now()
            product.save()
            return {'to': 0}

        review_monitors = {}
        # Marker present in the review block markup of "normal" reviews.
        normal_marker = re.compile('reviews:filter-action:push-state')

        def _scrape_page(page):
            """Parse up to 10 reviews from one listing page into review_monitors."""
            sel = Selector(text=page.content)
            try:
                for i in range(10):
                    star = int(sel.xpath(".//*[@class='a-section review']/div/div[1]/a[1]/@title").extract()[i][0:1])
                    customer_id = sel.xpath(".//*[@class='a-section review']/div/div[2]/span//a/@href").extract()[i].split("/")[4]
                    review_text = sel.xpath(".//*[@class='a-section review']/div/div[4]/span/text()").extract()[i]
                    review_time = sel.xpath(".//*[@class='a-section review']/div/div[2]/span[4]/text()").extract()[i]
                    if country != 'jp':
                        review_time = review_time[3:]  # strip the leading "on "
                    # BUGFIX: index the i-th review block ([i], not [0]) so each
                    # review gets its own 'normal' flag; previously every review
                    # on a paginated page inherited the first review's flag.
                    block = sel.xpath(".//*[@class='a-section review']/div").extract()[i]
                    normal = bool(normal_marker.search(block))
                    review_monitors[customer_id] = {
                        'star': star,
                        'customer_id': customer_id,
                        'review_time': review_time,
                        'review_text': review_text,
                        'normal': normal,
                    }
            except Exception:
                # Fewer than 10 reviews on the page (IndexError) or a markup
                # change: keep whatever was parsed so far, as before.
                pass

        # Marketplace-specific markup for the aggregate star rating.
        if country in ['de', 'uk', 'fr', 'it', 'es']:
            star = (Selector(text=page.content).xpath('.//*[@id="cm_cr-product_info"]/div/div[1]/div[3]/span/text()').extract()[0]).split(" ")[0]
        elif country == 'jp':
            # The numeric rating is the last 3 characters of the JP label text.
            star = (Selector(text=page.content).xpath('.//*[@id="cm_cr-product_info"]/div/div[1]/div[3]/span/text()').extract()[0])[-3:]
        else:
            star = (Selector(text=page.content).xpath('.//*[@id="cm_cr-product_info"]/div/div[1]/div[3]/span/a/span/text()').extract()[0]).split(" ")[0]
        reviews_quant = Selector(text=page.content).xpath('.//*[@id="cm_cr-product_info"]/div/div[1]/div[2]/div/div/div[2]/div/span/text()').extract()[0]
        reviews_quant = int(re.sub(",", "", reviews_quant))  # drop thousands separators
        product.reviews_url = page.url
        product.star = star
        product.reviews_quant = reviews_quant
        product.save()
        print(reviews_quant)

        page_num = 0
        if reviews_quant > 10:
            # Multi-page listing: follow the "next" link, capped at 200 pages.
            while True:
                _scrape_page(page)
                print(page_num, len(review_monitors))
                next_href = Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href")
                if next_href and page_num < 200:
                    time.sleep(0.2 + random.random() * 0.5)  # polite crawl delay
                    page = get_url(next_href.extract()[0], country=country)
                    page_num += 1
                else:
                    break
        else:
            _scrape_page(page)
            print("length of reviews:", len(review_monitors))

        product.review_qa_complete_time = timezone.now()
        product.save()
        for customer_id in review_monitors:
            Review_detail.objects.get_or_create(
                product=product,
                star=review_monitors[customer_id]['star'],
                customer_id=customer_id,
                review_time=review_monitors[customer_id]['review_time'],
                review_text=review_monitors[customer_id]['review_text'],
                normal=review_monitors[customer_id]['normal'])
        return {'to': page_num}
    except Exception as e:
        # Celery retry: try again once, 40 seconds from now.
        dt = datetime.now(pytz.utc) + timedelta(seconds=40)
        self.retry(eta=dt, exc=e, max_retries=2)
def qa_collect(self,asin,country='us'):
    """Scrape the customer Q&A pages for *asin* and persist ``QA_detail`` rows.

    Walks the question list (newest first; up to 200 pages when
    pagination is present), parsing each question box: vote count,
    question text, answer, answerer name, date, answer count and answer
    URL.  Celery task: retries once more after 40s on any unexpected
    error.  Returns None.
    """
    try:
        product=Product.objects.filter(asin=asin,country=country)[0]
        page=get_url('ask/questions/asin/'+asin+'/ref=ask_ql_psf_ql_hza?sort=SUBMIT_DATE', country)
        tree = fromstring(page.content)
        print(tree.findtext('.//title'))
        # Amazon served a captcha page instead of content — give up for now.
        if tree.findtext('.//title')=='Robot Check' or tree.findtext('.//title')=='Amazon CAPTCHA':
            info = {'to':0}
            return info
        # Product has no questions at all.
        if Selector(text=page.content).xpath('.//*[@id="noResultsTitle"]'):
            info = {'to':0}
            return info
        qa_collection={}
        # Multi-page listing: the "next" pagination link exists.
        if Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href"):
            page_num=0
            while True:
                boxes=Selector(text=page.content).xpath(".//*[@class='a-section askTeaserQuestions']/div[@class='a-fixed-left-grid a-spacing-base']")
                for box in boxes:
                    # NOTE(review): answer_quan is deliberately NOT reset here, so
                    # a box without its own answer count reuses the previous box's
                    # value (and the very first box could raise NameError if its
                    # count is missing) — confirm this is intended.
                    answer_url,answer,answer_user,qa_time=None,None,None,None
                    vote=int(box.xpath(".//ul[@class='vote voteAjax']/li[2]/span[1]/text()").extract()[0])
                    question=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-small']//a[@class='a-link-normal']/text()").extract()[0]
                    try:
                        qa_time=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[-1:][0]
                    except:
                        pass
                    try:
                        # Short answers live in span[1]; long ones are collapsed
                        # into span.askLongText and must be joined back together.
                        if box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract() and country != 'jp':
                            answer=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                        elif box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract() and country == 'jp':
                            answer=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                            if answer == "":
                                try:
                                    answer=" ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                                except:
                                    pass
                        else:
                            answer=" ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                    except:
                        pass
                    try:
                        answer_user=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[0]
                    except:
                        pass
                    try:
                        # "See all N answers" link text: keep only the number.
                        answer_quan=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/text()").extract()[0]
                        answer_quan = re.search(r'\d+', answer_quan).group(0)
                    except:
                        pass
                    try:
                        # Relative link: prefix the marketplace base URL.
                        answer_url=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/@href").extract()[0]
                        answer_url=country_url(country)[:-1]+answer_url
                        #print("answer_url:",answer_url)
                    except:
                        pass
                    #print(answer_user,qa_time)
                    if answer_user == None:
                        pass
                    elif answer_user==qa_time:
                        # Name and date arrived as one string; split them on the
                        # marketplace-specific "By <name> on <date>" wording.
                        if country in['us','uk','ca','de']:
                            name_date=re.split(' on |By |Von | am ', answer_user)
                        elif country=='it':
                            name_date=re.split(' in |Da ', answer_user)
                        elif country=='fr':
                            name_date=re.split(' le |Par ', answer_user)
                        elif country=='es':
                            name_date=re.split(' el |Por ', answer_user)
                        elif country=='jp':
                            name_date=re.split('投稿者: |、投稿日: ', answer_user)
                        answer_user=name_date[1]
                        qa_time=name_date[2]
                    else:
                        answer_user=re.split(' on |By |Von | am ', answer_user)[-1:][0]
                        qa_time=re.split(' on |By |Von | am ', qa_time)[-1:][0]
                    if answer_url and answer_quan:
                        qa_collection[question]={'vote':vote,'question':question,'qa_time':qa_time.strip(),'answer':answer,'answer_user':answer_user.strip(),'answer_quan':answer_quan,'answer_url':answer_url}
                    elif answer:
                        qa_collection[question]={'vote':vote,'question':question,'qa_time':qa_time.strip(),'answer':answer,'answer_user':answer_user.strip()}
                print(len(qa_collection))
                # Follow "next page", capped at 200 pages.
                if Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href") and page_num<200:
                    time.sleep(2+random.random()*5)  # polite crawl delay
                    page=get_url((Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href")).extract()[0],country=country)
                    page_num += 1
                else:
                    break
        else:
            # Single-page listing: same parsing, but without the JP-specific
            # short/long-answer handling and without pagination.
            boxes=Selector(text=page.content).xpath(".//*[@class='a-section askTeaserQuestions']/div[@class='a-fixed-left-grid a-spacing-base']")
            for box in boxes:
                answer_url,answer,answer_user,qa_time=None,None,None,None
                vote=int(box.xpath(".//ul[@class='vote voteAjax']/li[2]/span[1]/text()").extract()[0])
                question=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-small']//a[@class='a-link-normal']/text()").extract()[0]
                try:
                    qa_time=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[-1:][0]
                except:
                    pass
                try:
                    if box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract():
                        answer=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                    else:
                        answer=" ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                except:
                    pass
                try:
                    answer_user=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[0]
                except:
                    pass
                try:
                    answer_quan=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/text()").extract()[0]
                    answer_quan = re.search(r'\d+', answer_quan).group(0)
                except:
                    pass
                try:
                    answer_url=box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/@href").extract()[0]
                    answer_url=country_url(country)[:-1]+answer_url
                except:
                    pass
                if answer_user == None:
                    pass
                elif answer_user==qa_time:
                    if country in['us','uk','ca','de']:
                        name_date=re.split(' on |By |Von | am ', answer_user)
                    elif country=='it':
                        name_date=re.split(' in |Da ', answer_user)
                    elif country=='fr':
                        name_date=re.split(' le |Par ', answer_user)
                    elif country=='es':
                        name_date=re.split(' el |Por ', answer_user)
                    elif country=='jp':
                        name_date=re.split('投稿者: |、投稿日: ', answer_user)
                    answer_user=name_date[1]
                    qa_time=name_date[2]
                else:
                    answer_user=re.split(' on |By |Von | am ', answer_user)[-1:][0]
                    qa_time=re.split(' on |By |Von | am ', qa_time)[-1:][0]
                if answer_url and answer_quan:
                    qa_collection[question]={'vote':vote,'question':question,'qa_time':qa_time,'answer':answer,'answer_user':answer_user,'answer_quan':answer_quan,'answer_url':answer_url}
                elif answer:
                    qa_collection[question]={'vote':vote,'question':question,'qa_time':qa_time,'answer':answer,'answer_user':answer_user}
        # Persist everything; fall back to num="1" when no answer count was
        # parsed, and to the URL-less variant when answer_url is missing.
        for qa in qa_collection:
            try:
                num=qa_collection[qa]['answer_quan']
            except:
                num="1"
            try:
                #if qa_collection[qa]['answer_url']:
                QA_detail.objects.get_or_create(product=product,vote=qa_collection[qa]['vote'],question=qa_collection[qa]['question'],qa_time=qa_collection[qa]['qa_time'],
                    answer=qa_collection[qa]['answer'],answer_person=qa_collection[qa]['answer_user'],num=num,answer_url=qa_collection[qa]['answer_url'])
            except:
                QA_detail.objects.get_or_create(product=product,vote=qa_collection[qa]['vote'],question=qa_collection[qa]['question'],qa_time=qa_collection[qa]['qa_time'],
                    answer=qa_collection[qa]['answer'],answer_person=qa_collection[qa]['answer_user'],num=num)
        #except:
        #    pass
        #report = GlucoseCsvReport(product)
        #report.email(product.user, 'subject', 'message')
    except Exception as e:
        # Celery retry: try again once, 40 seconds from now.
        dt = datetime.now(pytz.utc) + timedelta(seconds=40)
        self.retry(eta=dt, exc=e, max_retries=2)
def title_sellers(page, product, country, initial):
    """Parse the offer-listing *page* for third-party sellers of *product*.

    Collects up to 10 seller rows per page over at most 12 pagination
    steps, then reconciles them against stored ``Product_seller`` rows:
    on the *initial* run everything is saved as-is; on later runs new
    sellers create ``Seller_change`` records and trigger an alert e-mail
    to the product's followers, while vanished sellers are logged as
    'old' changes and deleted.
    """
    sell_items = {}
    counter = 12  # hard cap on the number of listing pages to walk
    not_first = False
    try:
        # Any pre-existing seller rows mean this is not the first scan.
        if Product_seller.objects.filter(product=product):
            not_first = True
    except:
        not_first = False
    # NOTE(review): this loop always runs all 12 iterations — there is no
    # early break, so when no pagination link exists the same page is
    # re-parsed (sell_items de-dupes by seller id) — confirm intended.
    while counter > 0:
        counter -= 1
        for i in range(10):
            try:
                #//*[@id="olpOfferList"]/div/div/div[2]/div[3]/h3/span/a
                # Seller rows start at div[2]; depending on the page layout the
                # seller link sits in div[4] or div[3].
                if Selector(text=page.content).xpath(
                        ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                        "]/div[4]/h3/span//a/@href"):
                    # Seller id is the trailing query-string value of the link.
                    seller = (Selector(text=page.content).xpath(
                        ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                        "]/div[4]/h3/span//a/@href").extract()[0]
                              ).split("=")[-1]
                    sell_url = country_url(country)[:-1] + (Selector(
                        text=page.content).xpath(
                            ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                            "]/div[4]/h3/span//a/@href").extract()[0])
                    name = fromstring(page.content).findtext(
                        './/*[@id="olpOfferList"]/div/div/div[' + str(i + 2) +
                        ']/div[4]/h3/span/a')
                else:
                    seller = (Selector(text=page.content).xpath(
                        ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                        "]/div[3]/h3/span//a/@href").extract()[0]
                              ).split("=")[-1]
                    sell_url = country_url(country)[:-1] + (Selector(
                        text=page.content).xpath(
                            ".//*[@id='olpOfferList']/div/div/div[" + str(i + 2) +
                            "]/div[3]/h3/span//a/@href").extract()[0])
                    name = fromstring(page.content).findtext(
                        './/*[@id="olpOfferList"]/div/div/div[' + str(i + 2) +
                        ']/div[3]/h3/span/a')
                price = fromstring(page.content).findtext(
                    './/*[@id="olpOfferList"]/div/div/div[' + str(i + 2) +
                    ']/div[1]/span[1]').strip()
                sell_items[seller] = {
                    'seller': seller,
                    'name': name,
                    'price': price,
                    'sell_url': sell_url
                }
                print(sell_items)
            except:
                # Fewer than 10 offers on the page, or the markup changed.
                print('寻找页面元素的逻辑错误')
        #product_seller.mark_time=datetime.datetime.now()
        # NOTE(review): the != '#' comparison is against a SelectorList and is
        # therefore always true — effectively only the existence check matters.
        if Selector(text=page.content).xpath(
                "//ul[@class='a-pagination']//a/@href") != '#' and Selector(
                    text=page.content).xpath(
                        "//ul[@class='a-pagination']//a/@href"):
            url_path = Selector(text=page.content).xpath(
                "//ul[@class='a-pagination']//a/@href").extract()[0]
            page = get_url(url_path, country=country)
    if not sell_items:
        return "没有跟卖卖家"
    if initial:
        # First scan: just record every seller found.
        for seller_id in sell_items:
            product_seller = Product_seller(
                product=product,
                name=sell_items[seller_id]['name'],
                seller_id=seller_id,
                price=sell_items[seller_id]['price'],
                sell_url=sell_items[seller_id]['sell_url'])
            product_seller.save()
    else:
        changed = False
        # Save sellers not seen before; log them as 'new' changes unless an
        # 'old' change was already recorded in the last 24h.
        for seller_id in sell_items:
            if not Product_seller.objects.filter(product=product,
                                                 seller_id=seller_id):
                product_seller = Product_seller(
                    product=product,
                    name=sell_items[seller_id]['name'],
                    sell_url=sell_items[seller_id]['sell_url'],
                    seller_id=seller_id,
                    price=sell_items[seller_id]['price'])
                product_seller.save()
                if not_first and not Seller_change.objects.filter(
                        product=product,
                        status='old',
                        created__gte=timezone.now() -
                        datetime.timedelta(days=1)):
                    seller_change = Seller_change(
                        product=product,
                        status='new',
                        name=sell_items[seller_id]['name'],
                        sell_url=sell_items[seller_id]['sell_url'],
                        seller_id=seller_id,
                        price=sell_items[seller_id]['price'],
                        created=datetime.datetime.now())
                    seller_change.save()
                    changed = True
        # Mark every seller still present so the cleanup below spares them.
        for seller_id in sell_items:
            if Product_seller.objects.filter(product=product,
                                             seller_id=seller_id):
                Product_seller.objects.filter(
                    product=product, seller_id=seller_id).update(flag=True)
        if changed and not_first:
            # E-mail every user following this product about the new sellers.
            product_to_user = Product_to_user.objects.filter(product=product)
            users = product_to_user.values_list('user', flat=True)
            User = get_user_model()
            users = User.objects.filter(id__in=users)
            seller_change = Seller_change.objects.filter(
                product=product,
                created__gte=timezone.now() - datetime.timedelta(minutes=3))
            sellers = seller_change.values_list('name', flat=True)
            for user in users:
                message = "\n".join([
                    u'{0},您好.'.format(user.username),
                    u'{0}有跟卖出现:'.format(product.title),
                    u'跟卖商家:',
                    ','.join([seller for seller in sellers]),
                    u'详情请见:',
                    '/'.join(['amz668.com/follow_sale', product.slug]),
                    u'直达亚马逊:{0}'.format(page.url)
                ])
                send_email(user.email, message, '出现新的跟卖商品')
        # Sellers that were not re-flagged above have vanished: log them as
        # 'old' changes, delete them, then reset flags for the next scan.
        for product_seller in Product_seller.objects.filter(product=product,
                                                            flag=False):
            seller_change = Seller_change(product=product,
                                          status='old',
                                          name=product_seller.name,
                                          seller_id=product_seller.seller_id,
                                          price=product_seller.price,
                                          created=datetime.datetime.now())
            seller_change.save()
        Product_seller.objects.filter(product=product, flag=False).delete()
        Product_seller.objects.filter(product=product).update(flag=False)
def asin_title(asin, country):
    """Fetch the offer-listing page for *asin* in the given marketplace."""
    listing_path = f'gp/offer-listing/{asin}/ref=olp_page_5?ie=UTF8&startIndex=500'
    return get_url(listing_path, country=country)
def review_monitor(self, asin, country, comp_star, crontab=False):
    """Watch the newest reviews of *asin* and alert on low ratings.

    Loads page 1 of the product's reviews (most recent first) and records
    every review whose star rating is <= *comp_star*.  On the initial run
    (crontab=False) all matches are saved; on scheduled runs only new
    matches are saved and, if any appeared, the product's owner is
    e-mailed.  Returns a status string on failure paths, otherwise None.
    """
    # NOTE(review): unlike review_collect, the Product filter ignores the
    # country argument — confirm ASINs are unique across marketplaces here.
    product = Product.objects.filter(asin=asin)[0]
    url_add = 'product-reviews/' + asin + '/ref=cm_cr_arp_d_viewopt_srt?sortBy=recent&pageNumber=1'
    page = get_url(url_add, country=country)
    if not page:
        return '没有打开页面'
    #tree = fromstring(page.content)
    #print(tree.findtext('.//title'),page.content)
    review_monitors = {}
    sel = Selector(text=page.content)
    parsed = 0
    try:
        # Amazon lists at most 10 reviews per page.
        for i in range(10):
            try:
                star = int(
                    sel.xpath(
                        ".//*[@class='a-section review']/div/div[1]/a[1]/@title"
                    ).extract()[i][0:1])
                customer_id = sel.xpath(
                    ".//*[@class='a-section review']/div/div[2]/span//a/@href"
                ).extract()[i].split("/")[4]
                # [3:] strips the leading "on " of the date text.
                review_time = sel.xpath(
                    ".//*[@class='a-section review']/div/div[2]/span[4]/text()"
                ).extract()[i][3:]
            except IndexError:
                # BUGFIX: fewer than 10 reviews used to abort the whole task and
                # discard the reviews already parsed; now we just stop the loop.
                break
            parsed += 1
            if star <= comp_star:
                review_monitors[customer_id] = {
                    'star': star,
                    'customer_id': customer_id,
                    'review_time': review_time
                }
        print(review_monitors)
    except Exception:
        # Unexpected markup change: preserve the original failure message.
        return '商品没有评论'
    if parsed == 0:
        return '商品没有评论'
    if not crontab:
        # Initial run: save every low-star review found.
        for customer_id in review_monitors:
            review = Review(
                product=product,
                star=review_monitors[customer_id]['star'],
                customer_id=customer_id,
                review_time=review_monitors[customer_id]['review_time'])
            review.save()
    else:
        # Scheduled run: save only reviews we have not recorded yet.
        change = False
        for customer_id in review_monitors:
            if not Review.objects.filter(
                    product=product,
                    customer_id=customer_id,
                    review_time=review_monitors[customer_id]['review_time']):
                change = True
                review = Review(
                    product=product,
                    star=review_monitors[customer_id]['star'],
                    customer_id=customer_id,
                    review_time=review_monitors[customer_id]['review_time'])
                review.save()
        if change:
            # Alert the owner with the reviews saved in the last 3 minutes.
            user = product.user
            reviews = Review.objects.filter(product=product,
                                            created__gte=timezone.now() -
                                            datetime.timedelta(minutes=3))
            customer_ids = reviews.values_list('customer_id', flat=True)
            message = "\n".join([
                u'{0},您好.'.format(user.username),
                '{0}有差评出现:'.format(product.title),
                u'评论人:',
                ','.join([customer_id for customer_id in customer_ids]),
                u'评论详情:',
                '/'.join(['amz668.com/review_monitor', product.slug]),
                u'直达亚马逊:{0}'.format(page.url)
            ])
            send_email(user.email, message, '您跟踪的商品出现新的差评')