def get_Promo_Sku(sku_id):
    """Fetch the promotion sections JD reports for a single SKU.

    Calls the promotion endpoint, e.g.
    http://cd.jd.com/promotion/v2?skuId=1279827&area=1_72_2799_123&cat=670,729,7311199999
    and copies each section whose companion status field equals 200.

    Args:
        sku_id: value interpolated into the ``skuId`` query parameter.

    Returns:
        dict: may hold the keys ``'quan'``, ``'ads'`` and ``'prom'``. Sections
        copied before a parsing/lookup failure are kept; the failure itself
        is logged and never raised to the caller.
    """
    api_url = 'http://cd.jd.com/promotion/v2?skuId=%s&area=1_72_2799_123&cat=670,729,7311199999' % sku_id

    # Same attempt schedule as before: twice asking for GBK, then once
    # without an explicit encoding argument. The first success wins.
    json_str = "{}"
    for extra_args in (('gbk',), ('gbk',), ()):
        try:
            json_str = url_utils.getWebResponse(api_url, *extra_args)
            break
        except Exception:
            # Best effort: fall through to the next attempt. If all of them
            # fail, the "{}" placeholder makes the lookup below fail and the
            # error gets logged there.
            continue

    ret_map = {}
    try:
        obj = json.loads(json_str)
        if obj['quanStatus'] == 200:
            ret_map['quan'] = obj['quan']
        if obj['adsStatus'] == 200:
            ret_map['ads'] = obj['ads']
        if obj['promStatus'] == 200:
            ret_map['prom'] = obj['prom']
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        logging.error('JD_API::get_Promo_Sku() failed, sku_id = %s', sku_id)
    return ret_map
def get_Promo_Sku(sku_id):
    """Fetch the promotion sections JD reports for a single SKU.

    Calls the promotion endpoint, e.g.
    http://cd.jd.com/promotion/v2?skuId=1279827&area=1_72_2799_123&cat=670,729,7311199999
    and copies each section whose companion status field equals 200.

    Args:
        sku_id: value interpolated into the ``skuId`` query parameter.

    Returns:
        dict: may hold the keys ``'quan'``, ``'ads'`` and ``'prom'``. Sections
        copied before a parsing/lookup failure are kept; the failure itself
        is logged and never raised to the caller.
    """
    api_url = 'http://cd.jd.com/promotion/v2?skuId=%s&area=1_72_2799_123&cat=670,729,7311199999' % sku_id

    # Same attempt schedule as before: twice asking for GBK, then once
    # without an explicit encoding argument. The first success wins.
    json_str = "{}"
    for extra_args in (('gbk',), ('gbk',), ()):
        try:
            json_str = url_utils.getWebResponse(api_url, *extra_args)
            break
        except Exception:
            # Best effort: fall through to the next attempt. If all of them
            # fail, the "{}" placeholder makes the lookup below fail and the
            # error gets logged there.
            continue

    ret_map = {}
    try:
        obj = json.loads(json_str)
        if obj['quanStatus'] == 200:
            ret_map['quan'] = obj['quan']
        if obj['adsStatus'] == 200:
            ret_map['ads'] = obj['ads']
        if obj['promStatus'] == 200:
            ret_map['prom'] = obj['prom']
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        logging.error('JD_API::get_Promo_Sku() failed, sku_id = %s', sku_id)
    return ret_map
def __getPrices_JD_100__(sku_list):
    """Query the price API for ``sku_list`` and normalise the field types.

    Args:
        sku_list: SKU ids handed to ``__get_price_call_url___`` to build the
            request URL.

    Returns:
        list: ``[]`` when ``sku_list`` is empty; otherwise the decoded JSON
        response, where every item has ``'id'`` converted to int, ``'p'`` and
        ``'m'`` converted to float, and ``'pcp'`` set to a float when the
        item carried that key or to ``None`` when it did not.
    """
    if not len(sku_list):
        return []

    raw_json = url_utils.getWebResponse(__get_price_call_url___(sku_list))
    price_items = json.loads(raw_json)

    for entry in price_items:
        entry['id'] = int(entry['id'])
        entry['p'] = float(entry['p'])
        entry['m'] = float(entry['m'])
        # 'pcp' is optional in the response; always expose the key.
        entry['pcp'] = float(entry['pcp']) if 'pcp' in entry else None
    return price_items
def __getCommentCount_JD__(sku_list):
    """Fetch the comment-count records for a list of SKUs.

    Calls, e.g.,
    http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=1279827
    with all ids joined by commas.

    Args:
        sku_list: iterable of SKU ids; each is formatted with ``%s``.

    Returns:
        The value stored under ``'CommentsCount'`` in the JSON response, or
        an empty list when the response cannot be parsed or lacks that key.
    """
    sku_str = ','.join(['%s' % sku for sku in sku_list])
    api_url = 'http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=%s' % sku_str
    json_str = url_utils.getWebResponse(api_url)

    try:
        return json.loads(json_str)['CommentsCount']
    except Exception:
        # The handler used to format an undefined ``sku_id`` name, which
        # turned every parse failure into a NameError. Log the ids that were
        # actually requested instead.
        logging.error('JD_API::getCommentCount_JD() failed, sku_id = %s', sku_str)
        return []
def _get_Stock_Status(sku_list):
    """Query the area stock-state endpoint for a batch of SKUs.

    Example request:
    http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=1861098;1856588&area=1,2901,2906,0

    Args:
        sku_list: iterable of SKU ids; each is formatted with ``%s`` and the
            results are joined with ``;``.

    Returns:
        list: one entry per key of the decoded JSON response. Each entry is
        the value stored under that key, with the key itself added to it as
        ``'sku_id'``.
    """
    sku_param = ';'.join(['%s' % sku for sku in sku_list])
    api_url = "http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=%s&area=1,2901,2906,0" % sku_param

    stock_by_sku = json.loads(url_utils.getWebResponse(api_url))

    statuses = []
    for sku_key in stock_by_sku:
        entry = stock_by_sku[sku_key]
        # Tag the record with the key it was filed under.
        entry['sku_id'] = sku_key
        statuses.append(entry)
    return statuses
def __getCommentCount_JD__(sku_list):
    """Fetch the comment-count records for a list of SKUs.

    Calls, e.g.,
    http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=1279827
    with all ids joined by commas.

    Args:
        sku_list: iterable of SKU ids; each is formatted with ``%s``.

    Returns:
        The value stored under ``'CommentsCount'`` in the JSON response, or
        an empty list when the response cannot be parsed or lacks that key.
    """
    sku_str = ','.join(['%s' % sku for sku in sku_list])
    api_url = 'http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=%s' % sku_str
    json_str = url_utils.getWebResponse(api_url)

    try:
        return json.loads(json_str)['CommentsCount']
    except Exception:
        # The handler used to format an undefined ``sku_id`` name, which
        # turned every parse failure into a NameError. Log the ids that were
        # actually requested instead.
        logging.error('JD_API::getCommentCount_JD() failed, sku_id = %s', sku_str)
        return []
def _get_Stock_Status(sku_list):
    """Query the area stock-state endpoint for a batch of SKUs.

    Example request:
    http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=1861098;1856588&area=1,2901,2906,0

    Args:
        sku_list: iterable of SKU ids; each is formatted with ``%s`` and the
            results are joined with ``;``.

    Returns:
        list: one entry per key of the decoded JSON response. Each entry is
        the value stored under that key, with the key itself added to it as
        ``'sku_id'``.
    """
    sku_param = ';'.join(['%s' % sku for sku in sku_list])
    api_url = "http://ss.3.cn/ss/areaStockState/mget?app=search_pc&ch=1&skuNum=%s&area=1,2901,2906,0" % sku_param

    stock_by_sku = json.loads(url_utils.getWebResponse(api_url))

    statuses = []
    for sku_key in stock_by_sku:
        entry = stock_by_sku[sku_key]
        # Tag the record with the key it was filed under.
        entry['sku_id'] = sku_key
        statuses.append(entry)
    return statuses
def __get_detail_page_content__(sku_id):
    """Return the detail-page HTML for ``sku_id``, going through memcache.

    A cached copy (key ``JD_DETAIL_HTML9_<sku_id>``) is returned as-is when
    present. Otherwise the page is downloaded and decoded as GBK, falling
    back to GB18030. Non-empty results are cached for
    ``MEMCACHE_DETAIL_HTML_TIMEOUT``.

    Args:
        sku_id: SKU identifier used for both the cache key and the page URL.

    Returns:
        The page content. An empty string when the download failed; the
        undecoded response when neither GBK nor GB18030 could decode it.
    """
    mc_key = 'JD_DETAIL_HTML9_%s' % sku_id
    cached = mc.get(mc_key)
    if cached is not None:
        return cached

    # Fetching is best effort: a failure yields an empty page (which is not
    # cached) rather than an exception, but it is no longer silent.
    url = None
    try:
        url = __get_detail_page_url__(sku_id)
        html = url_utils.getWebResponse(url)
    except Exception:
        logging.warning('url=%s, failed fetching detail page, sku_id=%s', url, sku_id)
        return ""

    try:
        html = html.decode('gbk')
    except Exception:
        try:
            html = html.decode('gb18030')
        except Exception:
            # html is left exactly as it was downloaded.
            logging.warning('url=%s, failed decoding using GBK or GB18030, using utf-8 now... may cause problems' % url)

    if len(html) > 0:
        mc.set(mc_key, html, MEMCACHE_DETAIL_HTML_TIMEOUT)
    return html
def get_Promo_Category(category_id):
    """Fetch the promotion sections JD reports for a category.

    Calls the promotion endpoint with a fixed ``skuId=1``, e.g.
    http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=737%2C794%2C798

    Args:
        category_id: string whose ``-`` separators are rewritten to ``%2C``
            (a URL-encoded comma) for the ``cat`` query parameter.

    Returns:
        dict: may hold the keys ``'quan'``, ``'ads'`` and ``'prom'``, one for
        each section whose status field equals 200. Sections copied before a
        parsing/lookup failure are kept; the failure is logged, not raised.
        Errors from the HTTP request itself are not caught here.
    """
    cat_id = category_id.replace('-', '%2C')
    api_url = 'http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=%s' % cat_id
    json_str = url_utils.getWebResponse(api_url, 'gbk')

    ret_map = {}
    try:
        obj = json.loads(json_str)
        if obj['quanStatus'] == 200:
            ret_map['quan'] = obj['quan']
        if obj['adsStatus'] == 200:
            ret_map['ads'] = obj['ads']
        if obj['promStatus'] == 200:
            ret_map['prom'] = obj['prom']
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        logging.error('JD_API::get_Promo_Category() failed, category_id = %s', category_id)
    return ret_map
def get_Promo_Category(category_id):
    """Fetch the promotion sections JD reports for a category.

    Calls the promotion endpoint with a fixed ``skuId=1``, e.g.
    http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=737%2C794%2C798

    Args:
        category_id: string whose ``-`` separators are rewritten to ``%2C``
            (a URL-encoded comma) for the ``cat`` query parameter.

    Returns:
        dict: may hold the keys ``'quan'``, ``'ads'`` and ``'prom'``, one for
        each section whose status field equals 200. Sections copied before a
        parsing/lookup failure are kept; the failure is logged, not raised.
        Errors from the HTTP request itself are not caught here.
    """
    cat_id = category_id.replace('-', '%2C')
    api_url = 'http://cd.jd.com/promotion/v2?skuId=1&area=1_72_2799_123&cat=%s' % cat_id
    json_str = url_utils.getWebResponse(api_url, 'gbk')

    ret_map = {}
    try:
        obj = json.loads(json_str)
        if obj['quanStatus'] == 200:
            ret_map['quan'] = obj['quan']
        if obj['adsStatus'] == 200:
            ret_map['ads'] = obj['ads']
        if obj['promStatus'] == 200:
            ret_map['prom'] = obj['prom']
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        logging.error('JD_API::get_Promo_Category() failed, category_id = %s', category_id)
    return ret_map
def loadCategoryList():
    """Download JD's category listing and store it in ``jd_category``.

    The JSONP payload from ``JD_CATEGORY_WEBSERVICE_URL`` is unwrapped,
    decoded and flattened by ``__extractCategoryList_fromJson__`` into
    ``|``-separated lines. Lines with fewer than four fields are reported
    and skipped. A line is stored only when its first field is non-empty and
    does not contain ``.com`` (such fields are treated as URLs, not ids).

    Returns:
        int: always 0.
    """
    raw = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    payload = json.loads(url_utils.removeJsonP(raw))
    category_lines = __extractCategoryList_fromJson__(payload)

    rows = []
    for line in category_lines:
        print(line)
        fields = line.split('|')
        if len(fields) < 4:
            print('error in length of category line')
            print(line)
            continue

        head = fields[0]
        # A first field holding a URL means the line carries no category id.
        if '.com' in head or len(head) == 0:
            continue
        rows.append((head, fields[1], timeHelper.getNow()))

    # persist categories
    affected = dbhelper.executeSqlWriteMany(
        'replace into jd_category values(%s,%s,%s)', rows)
    print('rows affected : jd_category : %s' % affected)
    return 0
def loadCategoryList():
    """Download JD's category listing and store it in ``jd_category``.

    The JSONP payload from ``JD_CATEGORY_WEBSERVICE_URL`` is unwrapped,
    decoded and flattened by ``__extractCategoryList_fromJson__`` into
    ``|``-separated lines. Lines with fewer than four fields are reported
    and skipped. A line is stored only when its first field is non-empty and
    does not contain ``.com`` (such fields are treated as URLs, not ids).

    Returns:
        int: always 0.
    """
    raw = url_utils.getWebResponse(JD_CATEGORY_WEBSERVICE_URL, JD_ENC)
    payload = json.loads(url_utils.removeJsonP(raw))
    category_lines = __extractCategoryList_fromJson__(payload)

    rows = []
    for line in category_lines:
        print(line)
        fields = line.split('|')
        if len(fields) < 4:
            print('error in length of category line')
            print(line)
            continue

        head = fields[0]
        # A first field holding a URL means the line carries no category id.
        if '.com' in head or len(head) == 0:
            continue
        rows.append((head, fields[1], timeHelper.getNow()))

    # persist categories
    affected = dbhelper.executeSqlWriteMany(
        'replace into jd_category values(%s,%s,%s)', rows)
    print('rows affected : jd_category : %s' % affected)
    return 0
def crawl_category(category_id):
    """Crawl all listing pages of a category and persist the products found.

    Page 1 is fetched first (retrying with other encodings while the
    response is empty) to learn the page count. When it lists no products,
    the crawl is retried on the id returned by ``__up_roll_category_id__``.
    The collected rows are written to ``jd_item_dynamic``,
    ``jd_item_category`` and ``jd_item_firstseen``.

    Args:
        category_id: category identifier understood by
            ``__get_category_page_url__``.

    Returns:
        dict: ``{'status': -1, 'msg': ...}`` when no products could be found.
        Otherwise a dict with ``'status'`` (0 only when the dynamic-table
        write reported status 0 and the category write affected rows),
        ``'item_dynamic'``, ``'item_category'`` and ``'item_first_seen'``.
    """
    logging.debug('category_id = %s -- page 1' % (category_id))
    url = __get_category_page_url__(category_id, 1)

    # Only the first page gets the encoding fallbacks.
    html = url_utils.getWebResponse(url, 'utf-8')
    for fallback_encoding in ('gb18030', 'gbk'):
        if html != "":
            break
        html = url_utils.getWebResponse(url, fallback_encoding)

    total_pages = jd_list_resolver.resolveTotalPageNum(html)
    product_list = jd_list_resolver.resolveProductListFromPage(html)

    no_items = {'status': -1, 'msg': 'No item in category product list'}
    if category_id is None:
        return no_items
    if len(product_list) == 0:
        # Nothing listed here: retry on the up-rolled category. Stop as soon
        # as there is nothing left to roll up to, instead of issuing one
        # more crawl with a None id.
        parent_id = __up_roll_category_id__(category_id)
        if parent_id is None:
            return no_items
        return crawl_category(parent_id)

    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s' % (category_id, page_iter))
        url = __get_category_page_url__(category_id, page_iter)
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)

    total_goods_num = len(product_list)

    # The price lookup is disabled, so every row is padded with three zeros
    # (presumably the price, price_m and price_pcp columns -- TODO confirm
    # against the jd_item_dynamic schema).
    for idx, row in enumerate(product_list):
        product_list[idx] = row + (0, 0, 0)

    # persist in database
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,  # was True - changed 12/23
    )
    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' % (category_id, total_goods_num))
    logging.debug('%s' % ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = [(prod[0], category_id) for prod in product_list]
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' % affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s' % category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): the placeholders below are wrapped in double quotes,
    # unlike the other statements here. If dbhelper passes parameters to the
    # driver, the stored values would carry extra quotes -- confirm against
    # dbhelper.executeSqlWriteMany before changing.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = [[item[0], nowtime, nowdate] for item in product_list]
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    return {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }
def crawl_category(category_id):
    """Crawl all listing pages of a category and persist the products found.

    Page 1 is fetched first (retrying with other encodings while the
    response is empty) to learn the page count. When it lists no products,
    the crawl is retried on the id returned by ``__up_roll_category_id__``.
    The collected rows are written to ``jd_item_dynamic``,
    ``jd_item_category`` and ``jd_item_firstseen``.

    Args:
        category_id: category identifier understood by
            ``__get_category_page_url__``.

    Returns:
        dict: ``{'status': -1, 'msg': ...}`` when no products could be found.
        Otherwise a dict with ``'status'`` (0 only when the dynamic-table
        write reported status 0 and the category write affected rows),
        ``'item_dynamic'``, ``'item_category'`` and ``'item_first_seen'``.
    """
    logging.debug('category_id = %s -- page 1' % (category_id))
    url = __get_category_page_url__(category_id, 1)

    # Only the first page gets the encoding fallbacks.
    html = url_utils.getWebResponse(url, 'utf-8')
    for fallback_encoding in ('gb18030', 'gbk'):
        if html != "":
            break
        html = url_utils.getWebResponse(url, fallback_encoding)

    total_pages = jd_list_resolver.resolveTotalPageNum(html)
    product_list = jd_list_resolver.resolveProductListFromPage(html)

    no_items = {'status': -1, 'msg': 'No item in category product list'}
    if category_id is None:
        return no_items
    if len(product_list) == 0:
        # Nothing listed here: retry on the up-rolled category. Stop as soon
        # as there is nothing left to roll up to, instead of issuing one
        # more crawl with a None id.
        parent_id = __up_roll_category_id__(category_id)
        if parent_id is None:
            return no_items
        return crawl_category(parent_id)

    for page_iter in range(2, total_pages + 1):
        logging.debug('category_id = %s -- page %s' % (category_id, page_iter))
        url = __get_category_page_url__(category_id, page_iter)
        html = url_utils.getWebResponse(url, 'utf-8')
        product_list = product_list + jd_list_resolver.resolveProductListFromPage(html)
        time.sleep(SLEEP_TIME)

    total_goods_num = len(product_list)

    # The price lookup is disabled, so every row is padded with three zeros
    # (presumably the price, price_m and price_pcp columns -- TODO confirm
    # against the jd_item_dynamic schema).
    for idx, row in enumerate(product_list):
        product_list[idx] = row + (0, 0, 0)

    # persist in database
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='jd_item_dynamic',
        num_cols=len(product_list[0]),
        value_list=product_list,
        is_many=True,
        need_history=False,  # was True - changed 01/03
        need_flow=False,  # was True - changed 12/23
    )
    logging.debug('Saved to DB -- category_id = %s -- sku_count=%s' % (category_id, total_goods_num))
    logging.debug('%s' % ret)

    # HANDLE JD_ITEM_CATEGORY
    item_cat_list = [(prod[0], category_id) for prod in product_list]
    sql2 = 'replace into jd_item_category values (%s,%s)'
    affected_rows2 = dbhelper.executeSqlWriteMany(sql2, item_cat_list)
    logging.debug('Saved to DB - item_category - affected rows = %s' % affected_rows2)
    if affected_rows2 <= 0:
        logging.error('Saving to item_category error, category_id = %s' % category_id)

    # HANDLE JD_ITEM_FIRSTSEEN
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    # NOTE(review): the placeholders below are wrapped in double quotes,
    # unlike the other statements here. If dbhelper passes parameters to the
    # driver, the stored values would carry extra quotes -- confirm against
    # dbhelper.executeSqlWriteMany before changing.
    sql3 = 'insert ignore into jd_item_firstseen values(%s,"%s","%s")'
    ftlist = [[item[0], nowtime, nowdate] for item in product_list]
    affected_rows3 = dbhelper.executeSqlWriteMany(sql3, ftlist)

    return {
        'status': 0 if ret['status'] == 0 and affected_rows2 > 0 else -1,
        'item_dynamic': ret,
        'item_category': affected_rows2,
        'item_first_seen': affected_rows3,
    }