def _get_byb_merchant(self, html_code=None):
    if html_code is None:
        return '', ''
    parser = GoodsParser(html_code)
    sname = parser._get_seller_name()
    seller_id = parser._get_seller_id()
    print('_get_byb_merchant.sname', sname)
    print('_get_byb_merchant.seller_id', seller_id)
    if not sname:
        seller_id = ''
    return sname, seller_id
def make_url(asin, cid=0, url_type='goods', urltitle='', sessionId=''):
    url_tuple = tuple()
    if url_type == 'goods':
        url, referer = GoodsParser.make_goods_url(asin, urltitle=urltitle, sessionId=sessionId)
        url_tuple = (url, referer)
    if url_type == 'reviews':
        url = ReviewsParser.make_reviews_url(asin, urltitle=urltitle)
        url_tuple = (url, )
    if url_type == 'tosell':
        url = TosellParser.make_tosell_url(asin)
        url_tuple = (url, )
    if url_type == 'keyword':
        url = GoodsParser.make_search_url(asin, cid)
        url_tuple = (url, )
    return url_tuple
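# A minimal usage sketch for make_url above; the ASIN 'B00EXAMPLE' and the search
# keyword are placeholder values for illustration only. The shape of the returned
# tuple follows the url_type branches in the function.
if __name__ == '__main__':
    goods_url, goods_referer = make_url('B00EXAMPLE', url_type='goods')       # (url, referer)
    (reviews_url,) = make_url('B00EXAMPLE', url_type='reviews')               # (url,)
    (tosell_url,) = make_url('B00EXAMPLE', url_type='tosell')                 # (url,)
    (search_url,) = make_url('wireless mouse', cid=0, url_type='keyword')     # (url,)
    print(goods_url, goods_referer, reviews_url, tosell_url, search_url)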
def get_brand_from_parser(asin):
    # Build the URL from the asin
    url, refer = BaseCrawler.make_url(asin)
    # Fetch the page data via the downloader helper
    html_code = get_data_from_requests(url)
    # Extract the brand info from the page via the goods parser
    print(222)
    brand = GoodsParser(html_code)._get_brand()
    return brand
def parser(self, html, html_type='', asin='', ip='', url='', ua='',
           info_log=None, debug_log=None, monitor_type=0, cookie=None,
           tosellSum=None, goods_html_code=None):
    '''Only goods, reviews and tosell are implemented here; for reviews, html
    must be an html_list. Other modules should override this method as needed.'''
    result = ()
    is_error = False
    if html_type == 'goods':
        try:
            goods_datas, bsr_data = GoodsParser().parser_goods(
                html, asin, monitor_type, ip=ip, ua=ua, debug_log=debug_log,
                download_url=url, cookies=cookie)
            result = (goods_datas, bsr_data)
        except Exception as e:
            is_error = True
            self.debug_log.error('[%s] goods parser error while parsing [%s]: [%s]' % (ip, url, e))
    if html_type == 'reviews':
        try:
            reviews_datas = ReviewsParser().reviews_parser(
                html, asin, ip=ip, download_url=url)
            result = (reviews_datas, )
        except Exception as e:
            is_error = True
            self.debug_log.error('[%s] reviews parser error while parsing [%s]: [%s]' % (ip, url, e))
    if html_type == 'tosell':
        try:
            tosell_info = TosellParser().tosell_parser(
                html, asin, tosellSum, ip=ip, download_url=url,
                goods_html_code=goods_html_code)
            result = (tosell_info, )
        except Exception as e:
            is_error = True
            self.debug_log.error('[%s] tosell parser error while parsing [%s]: [%s]' % (ip, url, e))
    return result, is_error
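# A minimal sketch of how parser() above is typically invoked from a download()
# method (assumes `self` is the crawler instance and `html`, `cookie`, `ua` and
# `url` were already obtained; the ASIN and IP below are placeholders):
#
#   result, is_error = self.parser(html, html_type='goods', asin='B00EXAMPLE',
#                                  ip='127.0.0.1', ua=ua, debug_log=self.debug_log,
#                                  monitor_type=1, cookie=cookie, url=url)
#   if not is_error and result:
#       goods_datas, bsr_data = result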
def get_brand_from_parser(asin, debug_log):
    # Build the URL from the asin
    urls = BaseCrawler.make_url(asin)
    url = urls[0]
    refer = urls[1]
    # Get a user agent
    ua = UaPond.get_new_ua()
    cookies = ''
    ip = ''
    ipQ = ''
    # Fetch the page data via the downloader helper
    html_data, cookie, is_error = get_html_useRequest(
        url, ua, ip, cookies, debug_log, refer, ipQ, url_type='goods', asin=asin)
    # Extract the brand info from the page via the goods parser
    brand = GoodsParser(html_data)._get_brand()
    return brand
def get_to_sell_price(html_code=None):
    return GoodsParser(html_code)._to_price(html_code)
def get_to_sell_sum(html_code=None):
    return GoodsParser(html_code)._to_sell(html_code)
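# A minimal usage sketch for the two helpers above, reusing a locally saved goods
# page (the '1.html' file name mirrors the standalone test snippet below):
if __name__ == '__main__':
    with open('1.html', 'r', encoding='utf8') as f:
        saved_html = f.read()
    print(get_to_sell_price(html_code=saved_html))
    print(get_to_sell_sum(html_code=saved_html))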
from Crawler.goodsParser import GoodsParser

goods = GoodsParser()
with open('1.html', 'r', encoding='utf8') as f:
    html = f.read()
goods.parser_goods(html, 'the_asin', '')
count = goods._get_review_count(html_code=html)
code = goods._get_review_rating(html_code=html)
print(count)
print(code)
def download(self, ip, asin_or_kw, url_dict):
    url_type = self.url_type
    asin = asin_or_kw
    monitor_type = url_dict.get('mtp') or 1
    print('url type: ', url_type)
    url_md5key = url_dict.get('md5') or ''
    if not url_md5key:
        url_md5key = self.get_md5_key(asin + url_type)
    startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
    time_now = lambda: time.time()
    time1 = time_now()
    ua = self.get_ua()
    self.debug_log.debug('goods ua: %s' % (ua))
    # value_str = ip + ua
    # self.debug_log.debug('goods value_str: %s' % (value_str))
    # cookMd5key = self.get_md5_key(value_str)
    cookMd5key = ''
    cookie = self.get_cookie(cookMd5key)
    print('\ngoodsCookie: ', cookie)
    # url_title = urlQ.get_urlTitle_from_string(asin)
    url_title = ''
    sessionId = ''
    # if cookie:
    #     sessionId = cookie.get('session-id')
    retry = False
    old_dnum = url_dict.get('dnum') or 0
    if old_dnum > 3:
        retry = True
    url, referer = GoodsParser.make_goods_url(asin, urltitle=url_title,
                                              sessionId=sessionId, retry=retry)
    cookies = cookie
    print('goods referer: %s' % (referer))
    print('[ip %s] working... [%s]' % (ip, url))
    if url:
        print('goods_url: ', url)
        html, cookiesObj, is_error = self.get_html(url, ua, ip, cookies, referer,
                                                   url_type=url_type, asin=asin)
        print('got the HTML')
        # with open('data/devtest/6_1.html', 'w', encoding='utf8') as f:
        #     f.write(html)
        durl = url_dict.get('durl') or []
        durl.append(url)
        url_dict['durl'] = list(set(durl))
        url_dict['dnum'] = old_dnum + 1
        if is_error:
            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
            msgInt = 6
            proxyInfo = 'get Html error'
            self.record_log(asin, time1, msgInt, url_type, startTime, ip, proxyInfo)
        else:
            analyze = self.analyze_html(html, cookie, cookiesObj, ip, asin_or_kw,
                                        url_dict, cookMd5key, time1, startTime,
                                        html_type=url_type)
            if analyze and analyze != 404:
                # fetch the url_title and save it
                # self.save_url_title(asin, html)
                cook = cookie
                if not cookie:
                    cook = cookiesObj
                result, is_error = self.parser(html, html_type=url_type, asin=asin,
                                               ip=ip, ua=ua, debug_log=self.debug_log,
                                               monitor_type=monitor_type, cookie=cook,
                                               url=url)
                if is_error:
                    self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                    msgInt = 3
                    proxyInfo = 'get data error'
                    self.record_log(asin, time1, msgInt, url_type, startTime, ip, proxyInfo)
                else:
                    if not result:
                        self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                        msgInt = 2
                        proxyInfo = 'get data defeated'
                        self.record_log(asin, time1, msgInt, url_type, startTime, ip, proxyInfo)
                    else:
                        goods_datas = result[0]
                        if goods_datas:
                            print('goods_datas:', goods_datas)
                            qty, qtydt = inv_start(asin, html, cook, ua, goods_datas[asin])
                            print('qty, qtydt:', qty, qtydt)
                            goods_datas[asin]['quantity'] = qty
                            goods_datas[asin]['qtydt'] = qtydt
                            print('goods_datas.add(qty, qtydt):', goods_datas)
                            bsr_data = result[1]
                            data_bytes = pickle.dumps(goods_datas)
                            if bsr_data:
                                # print('bsr_data1', bsr_data)
                                bsrData_bytes = pickle.dumps(bsr_data)
                                # print('bsrData_bytes', bsrData_bytes)
                                self.dataQ.add_bsrData_to_queue(bsrData_bytes)
                                # print('bsr data ok', bsr_data)
                            # from pprint import pprint
                            # pprint(goods_datas)
                            result1 = self.dataQ.add_goods_data_to_queue(data_bytes)
                            print(result1)
                            if not result1:
                                sys.exit()
                            # self.dataQ.record_data_ok_times()
                            # self.dataQ.record_goods_ok_times()
                            self.save_success_asin_keyword(asin, url_type=url_type)
                            msgInt = 1
                            proxyInfo = 'get data success'
                            self.record_log(asin, time1, msgInt, url_type, startTime, ip, proxyInfo)
                        else:
                            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                            time.sleep(1)
    else:
        print(url_type, 'no url')
        self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
        time.sleep(1)
def save_url_title(self, asin, html):
    urlTitle = GoodsParser.get_urltitle(asin, html)
    if urlTitle and len(urlTitle) <= 72:
        urlQ.add_urlTitle_to_string(asin, urlTitle)
def make_search_url(self, kw, cid):
    search_url = GoodsParser.make_search_url(kw, cid)
    return search_url
def is_page_not_found(self, html):
    return GoodsParser.is_page_not_found(html)
def is_RobotCheck(self, html):
    return GoodsParser.is_RobotCheck(html)
def download(self, asin_or_kw, url_dict):
    print(asin_or_kw, url_dict)
    url_type = self.url_type
    print(url_type)
    asin = asin_or_kw
    monitor_type = url_dict.get('mtp') or 1
    print('url type: ', url_type)
    url_md5key = url_dict.get('md5') or ''
    if not url_md5key:
        url_md5key = self.get_md5_key(asin + url_type)
    startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
    time_now = lambda: time.time()
    time1 = time_now()
    retry = False
    old_dnum = url_dict.get('dnum') or 0
    if old_dnum > 3:
        retry = True
    url, referer = GoodsParser.make_goods_url(asin, retry=retry)
    if url:
        print('goods_url: ', url)
        html, cookiesObj, is_error = self.get_html(url, referer=referer,
                                                   url_type=url_type, asin=asin)
        print('is_error:', is_error)
        durl = url_dict.get('durl') or []
        durl.append(url)
        url_dict['durl'] = list(set(durl))
        url_dict['dnum'] = old_dnum + 1
        if is_error:
            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
            msgInt = 6
            proxyInfo = 'get Html error'
            self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
        else:
            analyze = self.analyze_html(html, asin_or_kw, url_dict, time1,
                                        startTime, html_type=url_type)
            if analyze and analyze != 404:
                result, is_error = self.parser(html, html_type=url_type, asin=asin,
                                               debug_log=self.debug_log,
                                               monitor_type=monitor_type, url=url)
                if is_error:
                    self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                    msgInt = 3
                    proxyInfo = 'get data error'
                    self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
                else:
                    if not result:
                        self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                        msgInt = 2
                        proxyInfo = 'get data defeated'
                        self.record_log(asin, time1, msgInt, url_type, startTime, proxyInfo)
                    else:
                        goods_datas = result[0]
                        if goods_datas:
                            cookies, headers = cookiesObj
                            user_anget = headers.get('User-Agent')
                            print(user_anget)
                            from pprint import pprint
                            pprint(cookies)
                            pprint(goods_datas)
                            msgInt = 1
                            proxyInfo = 'get data success'
                            log_param = (asin, time1, msgInt, url_type, startTime, proxyInfo)
                            start(asin=asin, goods_datas=goods_datas,
                                  user_anget=user_anget, url_dict=url_dict,
                                  goods_html=html, cookies=cookies,
                                  log_param=log_param, crawler_obj=self)
                        else:
                            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
                            time.sleep(1)
    else:
        print(url_type, 'no url')
        self.the_url_is_discard(asin, url_dict, url_type, url_md5key)
        time.sleep(1)