def log(self, cls_name, method_name, message, type):
    """Instantiate the logger object and log to file depending on the message type."""
    logger = logging.getLogger("%s - %s:" % (cls_name, method_name))

    # Print the message out if the logger is set to be verbose.
    if self.verbose:
        print("%s: [%s - %s()] %s" % (type.upper(), cls_name, method_name, message))

    # Extra params picked up by the log formatter.
    extras = {
        'class': cls_name,
        'method': method_name,
    }

    # Determine the logging type and log as required.
    # Options are: info, warning, error, critical.
    if type == 'info':
        if self.debug:
            logger.setLevel(logging.INFO)
            logger.info(jd_utils.encoding(message), extra=extras)
    elif type == 'warning':
        logger.setLevel(logging.WARNING)
        logger.warning(message, extra=extras)
    elif type == 'error':
        logger.setLevel(logging.ERROR)
        logger.error(message, extra=extras)
        raise Exception("%s: %s" % (type.upper(), message))
    elif type == 'critical':
        logger.setLevel(logging.CRITICAL)
        logger.critical(message, extra=extras)
        raise Exception("%s: %s" % (type.upper(), message))
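# The `extra={'class': ..., 'method': ...}` fields above only show up in log
# output if a handler's formatter references them. A minimal sketch of such a
# handler, assuming a file destination (the function name, file name, and format
# string are illustrative, not taken from this repo):
def setup_file_logging(log_file="jdspider.log"):
    """Attach a file handler whose format consumes the 'class'/'method' extras."""
    handler = logging.FileHandler(log_file, encoding="utf-8")
    handler.setFormatter(logging.Formatter(
        "%(asctime)s %(levelname)s [%(class)s - %(method)s()] %(message)s"))
    logging.getLogger().addHandler(handler)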
def get_product_ids(url, jdb, tid):
    """Expand a landing page and store product vs. non-product links."""
    flag = 0
    while True:
        try:
            request = urllib.request.Request(url, headers=jd_headers)
            g_response = urllib.request.urlopen(request)
            # JD may serve gzip-compressed bodies; decompress before decoding.
            if g_response.info().get('Content-Encoding') == 'gzip':
                g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
            else:
                g_read = g_response.read()
            url_html = jd_utils.encoding(g_read)
            url_soup = BeautifulSoup(url_html, "html.parser")
            url_extend = url_soup.findAll(
                'a', attrs={"href": re.compile(r"^http://\w+\.jd\.com/.+\.(htm|html)$")})
            break
        except http.client.IncompleteRead:
            continue
        except Exception as e:
            if flag > 3:
                print("Network error, giving up expanding URL")
                return
            if getattr(e, 'errno', None) == errno.ECONNRESET:
                flag = flag + 1
                time.sleep(20)
                print("Retrying...[%d]" % flag)
                continue
            print("Error expanding product links: " + str(e))
            return

    prds = []
    no_prds = []
    for url_item in url_extend:
        url_str = url_item.get("href")
        m = re.match(r'^http://item\.jd\.com/\d+\.html$', url_str)
        if m:
            # The match is anchored, so m.string is the full product URL.
            prds.append(m.string)
        else:
            # Skip known non-product subdomains and utility pages
            # (e.g. http://red.jd.com/, http://tuan.jd.com/, http://auction.jd.com/).
            if (not re.match(r'^http://(help|red|tuan|auction|jr|smart|gongyi|app|en|media|m|myjd|chat|read|chongzhi|z|giftcard|fw|you|mobile|wiki|me)\.jd\.com', url_str)
                    and not re.match(r'^http://www\.jd\.com/compare/', url_str)
                    and not re.match(r'^http://club\.jd\.com/consultation/', url_str)):
                no_prds.append(url_str)

    # Batch all inserts under a single lock acquisition instead of per URL.
    if prds or no_prds:
        with gdb_lock:
            print('Thread[%d] inserting into database...' % tid)
            for item in prds:
                jdb.db_insert_product(item)
            for item in no_prds:
                jdb.db_insert_no_product(item)
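# The fetch / gzip-decompress / retry-on-ECONNRESET pattern above recurs in
# every spider method below. A hedged sketch of how it could be factored into
# one helper; `fetch_html`, `max_retries`, and `backoff` are illustrative names,
# while `jd_utils.encoding` is the repo's own decoder, and the module's existing
# imports (urllib.request, zlib, http.client, errno, time) are assumed present:
def fetch_html(url, headers, max_retries=3, backoff=10):
    """Return decoded HTML for `url`, or None after exhausting retries."""
    retries = 0
    while True:
        try:
            request = urllib.request.Request(url, headers=headers)
            response = urllib.request.urlopen(request)
            body = response.read()
            if response.info().get('Content-Encoding') == 'gzip':
                body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
            return jd_utils.encoding(body)
        except http.client.IncompleteRead:
            continue  # truncated read: just refetch
        except Exception as e:
            if getattr(e, 'errno', None) == errno.ECONNRESET and retries < max_retries:
                retries += 1
                time.sleep(backoff)
                continue
            return None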
def get_product_consults(self, product_url):
    """Fetch all consultation (Q&A) pages for one product and save them to disk."""
    result_path = jd_config.JDSPR_RESULT
    self.agent = None
    page_id = 1
    product_id = int(product_url.split('.')[2].split('/')[1])
    product_url = jd_item_url % product_id
    flag = 0
    while True:
        try:
            self.agent = random_jd_header(product_url)
            request = urllib.request.Request(product_url, headers=self.agent)
            g_response = urllib.request.urlopen(request)
            if g_response.info().get('Content-Encoding') == 'gzip':
                g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
            else:
                g_read = g_response.read()
            product_html = jd_utils.encoding(g_read)
            break  # fetch succeeded
        except UnicodeDecodeError:
            print("GBK/Unicode decoding error!")
            return
        except http.client.IncompleteRead:
            continue
        except Exception as e:
            if flag > 3:
                print("Consult thread[%d] network error, giving up on this product" % self.tid)
                return
            if getattr(e, 'errno', None) == errno.ECONNRESET:
                flag = flag + 1
                time.sleep(10)
                print("Consult thread[%d] retrying...[%d]" % (self.tid, flag))
                continue
            print("1. Other error: " + str(e))
            return

    product_name = None
    product_ts = None
    product_soup = BeautifulSoup(product_html, "html.parser")
    product_name = product_soup.find('h1')
    # Product category comes from the breadcrumb links.
    product_type = product_soup.find('div', attrs={"class": "breadcrumb"})
    if product_type:
        product_ts = product_type.findAll('a')
    if not product_name or not product_ts:
        print("1. Failed to extract product name and category, returning! Check[%s]" % product_url)
        print(self.agent['User-Agent'])
        return

    result_file = None
    try:
        i = 0
        for pt_item in product_ts:
            if pt_item:
                result_path = result_path + "/" + pt_item.string + "/"
                i = i + 1
                # Limit the depth of the category directory tree.
                if i > 3:
                    break
            else:
                print("2. Failed to extract product name and category! Check[%s]" % product_url)
                return
        if not os.path.exists(result_path):
            os.makedirs(result_path)
        result_file = "%s/%d.txt" % (result_path, product_id)
        if os.path.exists(result_file):
            return  # already scraped
        print("Consult thread[%d] processing product %d" % (self.tid, product_id))
        f = codecs.open(result_file, 'wb', encoding='utf-8')
        f.write("Product name: " + product_name.text + "\n")
    except Exception as e:
        print("3. Failed to extract product name and category!: %s, Check[%s]" % (str(e), product_url))
        if result_file and os.path.exists(result_file):
            try:
                os.remove(result_file)
            except:
                pass
        return

    count = 0
    while True:
        product_consult_url = jd_consult_url % (product_id, page_id)
        flag = 0
        progress = "."
        while True:
            progress = progress + "."
            try:
                self.agent = random_jd_header(product_url)
                request = urllib.request.Request(product_consult_url, headers=self.agent)
                g_response = urllib.request.urlopen(request)
                if g_response.info().get('Content-Encoding') == 'gzip':
                    g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
                else:
                    g_read = g_response.read()
                consult_html = jd_utils.encoding(g_read)
                break  # fetch succeeded
            except UnicodeDecodeError:
                print("GBK/Unicode decoding error!")
                f.close()
                return
            except http.client.IncompleteRead:
                continue
            except Exception as e:
                if flag > 3:
                    print("Consult thread[%d] network error, giving up on this product" % self.tid)
                    f.close()
                    return
                if getattr(e, 'errno', None) == errno.ECONNRESET:
                    flag = flag + 1
                    time.sleep(10)
                    print("Consult thread[%d] retrying...[%d]" % (self.tid, flag))
                    continue
                print("2. Other error: " + str(e))
                f.close()
                return

        consult_soup = BeautifulSoup(consult_html, "html.parser")
        count = count + self.get_page_consult(consult_soup, f)
        # progress == ".." means this was the first fetch: the product has
        # no consults at all, so drop the half-written file.
        if count == 0 and progress == "..":
            print("Consult thread[%d] - product consults empty, deleting product file: %s" % (self.tid, result_file))
            if result_file and os.path.exists(result_file):
                try:
                    os.remove(result_file)
                except:
                    pass
            return

        pagination = consult_soup.find('div', attrs={"class": "Pagination"})
        if not pagination:
            break
        if not pagination.findAll('a', attrs={"class": "next"}):
            break
        else:
            page_id = page_id + 1
        f.flush()

    print("Consult thread[%d] done, [%d] consults for product %d" % (self.tid, count, product_id))
    f.close()
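# Both paging loops (consults above, comments below) terminate by probing the
# pagination block for a "next" link. A sketch of that check as a standalone
# helper; `has_next_page` is an illustrative name, while the container classes
# "Pagination" and "pagin fr" come from the two call sites in this file:
def has_next_page(soup, container_class):
    """True if the page's pagination block contains a 'next' link."""
    pagination = soup.find('div', attrs={"class": container_class})
    if not pagination:
        return False
    return bool(pagination.findAll('a', attrs={"class": "next"}))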
def get_product_comments(self, product_url):
    """Fetch all comment pages for one product and save them to disk."""
    result_path = ""
    self.agent = random_jd_header()
    page_id = 1
    product_id = int(product_url.split('.')[2].split('/')[1])
    product_url = jd_item_url % product_id
    flag = 0
    while True:
        try:
            request = urllib.request.Request(product_url, headers=self.agent)
            g_response = urllib.request.urlopen(request)
            if g_response.info().get('Content-Encoding') == 'gzip':
                g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
            else:
                g_read = g_response.read()
            product_html = jd_utils.encoding(g_read)
            break  # fetch succeeded
        except UnicodeDecodeError:
            print("GBK/Unicode decoding error!")
            return
        except http.client.IncompleteRead:
            continue
        except Exception as e:
            if flag > 3:
                print("Comment thread[%d] network error, giving up on this product" % self.tid)
                return
            if getattr(e, 'errno', None) == errno.ECONNRESET:
                flag = flag + 1
                time.sleep(10)
                print("Comment thread[%d] retrying...[%d]" % (self.tid, flag))
                continue
            print("1. Other error: " + str(e))
            return

    product_name = None
    product_ts = None
    product_soup = BeautifulSoup(product_html, "html.parser")
    product_name = product_soup.find('h1')
    # Product category comes from the breadcrumb links.
    product_type = product_soup.find('div', attrs={"class": "breadcrumb"})
    if product_type:
        product_ts = product_type.findAll('a')
    if not product_name or not product_ts:
        print("1. Failed to extract product name and category, returning! Check[%s]" % product_url)
        print(self.agent['User-Agent'])
        return

    result_file = None
    try:
        i = 0
        for pt_item in product_ts:
            if pt_item:
                result_path = result_path + "/" + pt_item.string + "/"
                i = i + 1
                # Limit the depth of the category directory tree.
                if i > 3:
                    break
            else:
                print("2. Failed to extract product name and category! Check[%s]" % product_url)
                return
        # Comments are written to a flat local directory rather than the
        # category tree (the makedirs call is intentionally disabled here).
        result_file = "%s/%d_comm.txt" % (jd_config.JDSPR_RESULT_LOCAL, product_id)
        if os.path.exists(result_file):
            return  # already scraped
        print("Comment thread[%d] processing product %d" % (self.tid, product_id))
        f = codecs.open(result_file, 'wb', encoding='utf-8')
        f.write("Product name: " + product_name.text + "\n")
    except Exception as e:
        print("3. Failed to extract product name and category!: %s, Check[%s]" % (str(e), product_url))
        if result_file and os.path.exists(result_file):
            try:
                os.remove(result_file)
            except:
                pass
        return

    count = 0
    retries = 0
    while True:
        # Randomize part of the page URL to avoid being blocked.
        product_comment_url = jd_comment_url % (product_id, random.randint(3, 10), page_id)
        flag = 0
        progress = "."
        while True:
            progress = progress + "."
            try:
                self.agent = random_jd_header(product_comment_url)
                request = urllib.request.Request(product_comment_url, headers=self.agent)
                g_response = urllib.request.urlopen(request)
                if g_response.info().get('Content-Encoding') == 'gzip':
                    g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
                else:
                    g_read = g_response.read()
                comment_html = jd_utils.encoding(g_read)
                break  # fetch succeeded
            except UnicodeDecodeError:
                print("GBK/Unicode decoding error!")
                f.close()
                return
            except http.client.IncompleteRead:
                continue
            except Exception as e:
                if flag > 3:
                    print("Comment thread[%d] network error, giving up on this product" % self.tid)
                    f.close()
                    return
                if getattr(e, 'errno', None) == errno.ECONNRESET:
                    flag = flag + 1
                    time.sleep(2)
                    print("Comment thread[%d] retrying...[%d]" % (self.tid, flag))
                    continue
                print("2. Other error: " + str(e))
                f.close()
                return

        comment_soup = BeautifulSoup(comment_html, "html.parser")
        count_t = self.get_page_comment(comment_soup, product_comment_url, f)

        # Retry at most ~10 times here: JD sometimes serves an empty comment
        # page mid-run. Only retry if we already have comments (count != 0);
        # an empty first page falls through to the deletion check below.
        lucky_flag = 1
        if count_t == 0:
            if retries < 10:
                retries = retries + 1
                if count != 0:
                    # Refresh the user agent and retry after a random delay.
                    self.agent = random_jd_header(product_comment_url)
                    time.sleep(random.randint(3, 9))
                    print("Comment thread[%d] R[%d] %s" % (self.tid, retries, product_comment_url))
                    continue
            else:
                lucky_flag = 0
        retries = 0
        count = count + count_t
        # progress == ".." means this was the first fetch: the product has
        # no comments at all, so drop the half-written file.
        if count == 0 and progress == "..":
            print("Comment thread[%d] - product comments empty, deleting product file: %s" % (self.tid, result_file))
            if result_file and os.path.exists(result_file):
                try:
                    os.remove(result_file)
                except:
                    pass
            return

        pagination = comment_soup.find('div', attrs={"class": "pagin fr"})
        if not pagination:
            break
        if not pagination.findAll('a', attrs={"class": "next"}):
            break
        else:
            page_id = page_id + 1
        f.flush()

    print("Comment thread[%d] done, product[%d], comments[%d], LUCK[%s], PATH[%s]"
          % (self.tid, product_id, count, lucky_flag, result_path))
    f.close()
    return (product_url, count, lucky_flag, result_path, product_id)
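# The delete-the-partial-file cleanup above is repeated in several places in
# this file. A sketch of it as a small helper; `remove_if_exists` is an
# illustrative name, not something this repo defines:
def remove_if_exists(path):
    """Best-effort removal of a partially written result file."""
    if path and os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            pass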