def calculateSkuRatingScores():
    """Compute a weighted average rating score (1.0-5.0 stars) for every SKU
    with enough comments and persist it into jd_analytic_rating_score_latest
    (no history copy is kept).

    :return: persist result from crawler_helper, or None when the read
             query returned no rows (previously this crashed with an
             IndexError on retrows[0]).
    """
    # Schema used by the persist helper when the table does not exist yet.
    sql_cb = '''
        CREATE TABLE jd_analytic_rating_score_latest (
            sku_id bigint(20) NOT NULL,
            comment_count int(11) NOT NULL,
            rating_score float DEFAULT NULL,
            category_id varchar(255) NOT NULL,
            this_update_time datetime NOT NULL,
            PRIMARY KEY (sku_id)
            -- KEY skuid (sku_id),
            -- KEY cat_score (rating_score,category_id),
            -- KEY score (rating_score),
            -- KEY category (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Weighted mean = sum(i * ScoreICount) / CommentCount, joined with the
    # category mapping; rows below the significance threshold are dropped.
    sql = '''
        select skuid as sku_id,
               CommentCount,
               ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
               category_id,
               CURRENT_TIMESTAMP() as this_update_time
        from jd_item_comment_count_latest a
        left join jd_item_category b on a.SkuId = b.sku_id
        where a.CommentCount is not null
          and a.CommentCount >= %s
    ''' % (datamining_config.MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT)
    retrows = dbhelper.executeSqlRead2(sql, is_dirty=True)
    if not retrows:
        # BUGFIX: empty result used to raise IndexError on retrows[0].
        logging.error('calculateSkuRatingScores: no rows read, nothing persisted')
        return None
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_score',
        num_cols=len(retrows[0]),
        value_list=retrows,
        need_history=False,
        is_many=True,
        sql_create_table=sql_cb,
    )
    return ret
def calculateSkuRatingScores():
    """Derive a per-SKU weighted rating score (1.0-5.0) from the star-count
    columns and store it in jd_analytic_rating_score_latest."""
    min_comments = datamining_config.MIN_COMMENT_NUM_SO_RATING_SCORE_STATISTICALLY_SIGNIFICANT
    create_table_sql = '''
        CREATE TABLE jd_analytic_rating_score_latest (
            sku_id bigint(20) NOT NULL,
            comment_count int(11) NOT NULL,
            rating_score float DEFAULT NULL,
            category_id varchar(255) NOT NULL,
            this_update_time datetime NOT NULL,
            PRIMARY KEY (sku_id)
            -- KEY skuid (sku_id),
            -- KEY cat_score (rating_score,category_id),
            -- KEY score (rating_score),
            -- KEY category (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    read_sql = '''
        select skuid as sku_id,
               CommentCount,
               ((a.Score1Count)*1.0+(a.Score2Count)*2.0+(a.Score3Count)*3.0+(a.Score4Count)*4.0+(a.Score5Count)*5.0)/a.CommentCount as rating_score,
               category_id,
               CURRENT_TIMESTAMP() as this_update_time
        from jd_item_comment_count_latest a
        left join jd_item_category b on a.SkuId = b.sku_id
        where a.CommentCount is not null
          and a.CommentCount >= %s
    ''' % min_comments
    score_rows = dbhelper.executeSqlRead2(read_sql, is_dirty=True)
    return crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_rating_score',
        num_cols=len(score_rows[0]),
        value_list=score_rows,
        need_history=False,
        is_many=True,
        sql_create_table=create_table_sql,
    )
def generate_index():
    """Persist the merged category_id -> category_text mapping into
    jd_index_category_latest."""
    merged = getMergedCategoryInfo()
    # One [id, text] row per category, in dict iteration order.
    rows = [[cat_id, merged[cat_id]] for cat_id in merged]
    create_sql = '''
        CREATE TABLE jd_index_category_latest (
            category_id varchar(255) NOT NULL,
            category_text varchar(255) NOT NULL,
            PRIMARY KEY (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    return crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_index_category',
        num_cols=len(rows[0]),
        value_list=rows,
        is_many=True,
        need_history=False,
        sql_create_table=create_sql,
    )
def generate_index():
    """Write the merged category dictionary out as the
    jd_index_category(_latest) lookup table."""
    category_map = getMergedCategoryInfo()
    # (key, value) pairs become two-element rows for the bulk insert.
    value_list = [list(pair) for pair in category_map.items()]
    table_schema = '''
        CREATE TABLE jd_index_category_latest (
            category_id varchar(255) NOT NULL,
            category_text varchar(255) NOT NULL,
            PRIMARY KEY (category_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    return crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_index_category',
        num_cols=len(value_list[0]),
        value_list=value_list,
        is_many=True,
        need_history=False,
        sql_create_table=table_schema,
    )
def calculate_min_max_price():
    """Aggregate per-SKU price statistics (avg/min/median/max, sample count,
    min_ratio, LPDR) over recent price history and persist them into
    jd_analytic_price_stat_latest.

    NOTE(review): percentile_minx / min_ratio / LPDR look like custom SQL
    UDFs installed on the MySQL server -- confirm they exist before reuse.

    :return: persist result, or None when no recent price rows were found
             (previously this crashed with an IndexError on retrows[0]).
    """
    logging.debug('Reading item_dynamic history and calculate min/max/avg/median price for skus...')
    t1 = time.time()
    # Only SKUs updated within the recency window are aggregated.
    dt = timeHelper.getTimeAheadOfNowHours(datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    sql1 = '''
        select sku_id,
               AVG(price) as average_price,
               min(price) as min_price,
               -- median(price) as median_price, -- changed 12/22
               percentile_minx(price) as median_price,
               max(price) as max_price,
               max(update_time) as origin_time,
               count(1) as sample_count,
               min_ratio(price) as min_ratio,
               LPDR(price) as LPDR
        from
            -- jd_item_dynamic -- changed 12/22
            jd_item_price
        where
            -- update_time > '2015-11-14 0:00:00' and -- 双十一期间价格
            price > 0
        group by sku_id
        having max(update_time) >= '%s'
    ''' % (dt)
    logging.debug(sql1)
    retrows = dbhelper.executeSqlRead2(sql1, is_dirty=True)
    logging.debug("Done, rows to insert: %s" % len(retrows))
    t2 = time.time()
    logging.debug('using seconds: %0.1f' % (t2 - t1))
    if not retrows:
        # BUGFIX: empty result used to raise IndexError on retrows[0].
        logging.error('calculate_min_max_price: no rows read, nothing persisted')
        return None
    sql_cb = '''
        CREATE TABLE jd_analytic_price_stat_latest (
            sku_id bigint(20) NOT NULL,
            average_price float NOT NULL,
            min_price float NOT NULL,
            median_price float NOT NULL,
            max_price float NOT NULL,
            origin_time datetime NOT NULL,
            sample_count int(11) NOT NULL,
            min_ratio float NOT NULL,
            LPDR float NOT NULL,
            PRIMARY KEY (sku_id),
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    ret = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(retrows[0]),
        value_list=retrows,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb,
    )
    return ret
def generate_category_property_mapping(): # sql = 'select * from jd_category' # retrows = dbhelper.executeSqlRead(sql) # # for row in retrows: # category_id = row['id'] # category_name = row['name'] black_list_clause = '","'.join(PROPERTY_KEY_BLACK_WORD_LIST) black_list_clause = '"%s"' %black_list_clause sql2 = ''' select sku_id, p_key, p_value, category_id from jd_item_property_latest a left join jd_item_category b using (sku_id) where LENGTH(p_value)>3 and LENGTH(p_value)<=30 and not (p_value like '%%个' and length(p_value)<=10) and p_key<>'__DEFAULT__' and LENGTH(p_key)>=6 and LENGTH(p_key)<=21 and p_key not like '%%重%%' and p_key not like '%%尺寸%%' and p_key not like '%%厚度%%' and p_key not like '%%宽度%%' and p_key not like '%%长宽高%%' and p_key not like '%%mm%%' and p_key <> '上架时间' and p_key NOT IN (%s) ''' %(black_list_clause) vlist = dbhelper.executeSqlRead2(sql2, is_dirty=True) sql_cb = ''' CREATE TABLE jd_analytic_property_latest ( sku_id bigint(20) DEFAULT NULL, p_key varchar(255) DEFAULT NULL, p_value varchar(255) DEFAULT NULL, category_id varchar(255) DEFAULT NULL, KEY skuid_categoryid (sku_id,category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' print "now writing to db..." t1 = time.time() ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_property', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t2 = time.time() print "using : %0.0f" %(t2-t1) return ret
def calculatePercentile(): """ (1) load data (2) hash by key: category_id, ordered dict (3) for each item, find it's category_id array, got it's pos and percentile (4) store results in db :return: """ t1 = time.time() # STEP (1) print "step 1/4: reading data from rating_score_latest" sql = ''' select * from jd_analytic_rating_score_latest where rating_score is NOT NULL and comment_count is not NULL AND comment_count >= %s -- and category_id like "670-729-%%" order by comment_count DESC order by category_id -- limit 1000 ''' %datamining_config.MIN_SKU_NUM_PER_CATEGORY_SO_STATISTICALLY_SIGNIFICANT retrows = dbhelper.executeSqlRead(sql) t2 = time.time() print "Done, rows read: %s, seconds used: %0.1f" %(len(retrows), t2-t1) # STEP (2) print "step 2/4: sorting category scores..." print "" key_col = 'rating_score' tdict = rows_helper.transform_retrows_to_hashed_arrays(retrows, key_col_name='category_id') odict = {} for cat in tdict: array = tdict[cat] _get_ordered_array(array, key_col) odict[cat] = array t3 = time.time() print "Done, ordered_dict generated, num of keys = %s, time used = %0.1f" %(len(odict),t3-t2) print "" # STEP (3) print "step 3/4: calculate rating percentile for each sku..." #sku_dict = rows_helper.transform_retrows_to_dict(retrows, key_col_name='sku_id') for row in retrows: catid = row['category_id'] myval = row[key_col] pt = _getPercentileGreaterThan(myval,odict[catid],key_col) row['percentile_'+key_col] = pt row['sample_num'] = len(odict[catid]) if odict[catid] is not None else 0 # print "myval: %s\tpt: %s" %(myval,pt) t4 = time.time() print "Done, using seconds: %0.1f" %(t4-t3) print "" # Step (4) print 'step 4/4: storing results in db...' 
# for item in retrows: # for key in item: # print key # break sql_cb = ''' CREATE TABLE jd_analytic_rating_percentile_latest ( sku_id bigint(20) NOT NULL, comment_count int(11) NOT NULL, this_update_time datetime NOT NULL, rating_score float NOT NULL, category_id varchar(255) NOT NULL, rating_sample_num int(11) DEFAULT 0, percentile_rating_score float DEFAULT NULL, PRIMARY KEY (sku_id), KEY skuid (sku_id) -- KEY cat_score (rating_score,category_id), -- KEY score (rating_score), -- KEY category (category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' vlist = rows_helper.transform_retrows_arrayofdicts_to_arrayoftuples(retrows) ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_rating_percentile', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t5 = time.time() print "Done, rows affected: %s, time used: %0.1f" %(ret, t5-t4) print "" return ret
def process_promo_detail():
    """Parse recent promo rows into two analytic tables.

    code == 15 ("reach X, deduct Y") rows go to jd_analytic_promo_deduction;
    code == 19 ("buy N get discount / free items") rows go to
    jd_analytic_promo_discount. Everything else is ignored.

    :return: mixed status dict from _generate_mixed_ret.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name
        from jd_analytic_promo_item_latest a
        left join jd_item_price_latest b on a.sku_id = b.sku_id
        left JOIN jd_item_category c on a.sku_id = c.sku_id
        left join jd_category d on c.category_id = d.id
        where a.dt >= "%s"
          and b.sku_id is not NULL
          and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    vlist = []
    vlist19 = []
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))
    num_15 = 0
    num_19 = 0
    num_15_repeated = 0
    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']
        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                    could_deduct = 0
                except Exception as e:
                    # BUGFIX: log the raw item; reach/deduction may be unbound
                    # (or stale from a previous iteration) when the conversion
                    # itself is what failed.
                    logging.error("bad reach/deduction item: %s" % (item,))
                    logging.error(e)
                    continue
                if price >= reach and reach > 0:
                    # Repeatable promos apply once per full 'reach' amount.
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct / price
                tp = [sku_id, dt, price, is_repeat, reach, deduction,
                      max_deduction, dr_ratio, maxp_ratio,
                      single_discount_rate, category_id, category_name,
                      pid, code, name, content, adurl, origin_dt]
                vlist.append(tp)
            if stat_has_repeat:
                num_15_repeated += 1
        elif code == 19:
            num_19 += 1
            # Classify the promo text: 0 = direct discount on total,
            # 1 = price reduction, 2 = unrecognized.
            type_word_list = ["总价打", "商品价格"]
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
            pt = re.compile(u'[\d.]+', re.U)
            pts = pt.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")
            if len(pts) < 2:
                # BUGFIX: fewer than two numbers means no record can be
                # built; previously this raised IndexError on pts[0]/pts[1].
                continue
            reach_num = discount = free_num = rf_ratio = None
            reach_num = float(pts[0])
            if deduct_type == 0:
                discount = pts[1]
            elif deduct_type == 1:
                free_num = float(pts[1])
                rf_ratio = float(free_num * 1.0 / reach_num)
            tp19 = [sku_id, dt, price, deduct_type, reach_num, discount,
                    free_num, rf_ratio, category_id, category_name,
                    pid, code, name, content, adurl, origin_dt]
            vlist19.append(tp19)
        else:
            pass
    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))
    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
            sku_id bigint(20) NOT NULL,
            add_time datetime NOT NULL,
            -- title varchar(255) NOT NULL,
            price float NOT NULL,
            is_repeat tinyint(4) NOT NULL,
            reach float NOT NULL,
            deduction float NOT NULL,
            max_deduction float NOT NULL,
            dr_ratio float NOT NULL,
            maxp_ratio float NOT NULL,
            single_discount_rate float NOT NULL,
            category_id varchar(255) NOT NULL,
            category_name varchar(255) DEFAULT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            origin_time datetime NOT NULL,
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
            sku_id bigint(20) NOT NULL,
            add_time datetime NOT NULL,
            -- title varchar(255) DEFAULT NULL,
            price float DEFAULT NULL,
            deduct_type smallint(6) DEFAULT NULL,
            reach_num smallint(6) DEFAULT NULL,
            discount float DEFAULT NULL,
            free_num smallint(6) DEFAULT NULL,
            rf_ratio float DEFAULT NULL,
            category_id varchar(255) DEFAULT NULL,
            category_name varchar(255) DEFAULT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            origin_dt datetime DEFAULT NULL,
            PRIMARY KEY (sku_id,pid),
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_deduction',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_deduction,
    )
    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))
    pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_discount',
        num_cols=len(vlist19[0]),
        value_list=vlist19,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_discount,
    )
    return _generate_mixed_ret([pret15, pret19])
def processItemPromo():
    """Flatten promo_json blobs from jd_promo_item_latest into promo rows
    (vlist) and gift rows (glist), persist both analytic tables, and record
    which SKUs carry gifts in jd_analytic_sku_gift.

    :return: mixed status dict from _generate_mixed_ret.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select sku_id, dt, promo_json
        from jd_promo_item_latest
        where promo_json is not NULL
          and LENGTH(promo_json)>100
          and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # BUGFIX: was a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit; only parse errors are expected.
            num_error += 1
            continue
        rtags = obj['pickOneTag']
        for tag in rtags:
            pid = tag['pid']
            code = tag['code']
            # 加价购 (pay-extra-to-buy) promos are deliberately skipped.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
            vlist.append(tp)
        tags = obj['tags']
        for tag in tags:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Gift promo: emit one row per gift in the tag.
                gifts = tag['gifts']
                for gift in gifts:
                    gift_name = "赠品"
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        tp_gift = [sku_id, dt, pid, code, name, gift_name,
                                   gift_num, gift_image, gift_sku_id,
                                   gift_gt, gift_gs, update_date]
                        glist.append(tp_gift)
                    except Exception as e:
                        logging.debug("error in extracting gift info for sku_id = %s" % sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
                vlist.append(tp)
    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))
    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
            sku_id bigint(20) NOT NULL,
            dt datetime NOT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            update_date datetime NOT NULL,
            PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
            sku_id bigint(20) NOT NULL,
            dt datetime NOT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            gift_name varchar(255) NOT NULL,
            gift_num int(11) NOT NULL,
            gift_image varchar(255) DEFAULT NULL,
            gift_sku_id bigint(20) NOT NULL,
            gift_gt varchar(255) DEFAULT NULL,
            gift_gs varchar(255) DEFAULT NULL,
            update_date datetime NOT NULL,
            PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Persist both tables (skip when there is nothing to write).
    ret1 = ret2 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )
    # Record which SKUs carry any gift at all.
    sglist = []
    cur_time = timeHelper.getNowLong()
    for gg in glist:
        sglist.append([gg[0], cur_time])
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    afr = 0
    if sglist:
        # BUGFIX: only run the bulk insert when there is data; an empty
        # batch no longer counts as a failure.
        afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
    ret3 = {
        'status': 0 if (afr > 0 or not sglist) else -1,
        'msg': "",
    }
    return _generate_mixed_ret([ret1, ret2, ret3])
def calculatePercentile(): """ (1) load data (2) hash by key: category_id, ordered dict (3) for each item, find it's category_id array, got it's pos and percentile (4) store results in db :return: """ t1 = time.time() # STEP (1) print "step 1/4: reading data from rating_score_latest" sql = ''' select * from jd_analytic_rating_score_latest where rating_score is NOT NULL and comment_count is not NULL AND comment_count >= %s -- and category_id like "670-729-%%" order by comment_count DESC order by category_id -- limit 1000 ''' % datamining_config.MIN_SKU_NUM_PER_CATEGORY_SO_STATISTICALLY_SIGNIFICANT retrows = dbhelper.executeSqlRead(sql) t2 = time.time() print "Done, rows read: %s, seconds used: %0.1f" % (len(retrows), t2 - t1) # STEP (2) print "step 2/4: sorting category scores..." print "" key_col = 'rating_score' tdict = rows_helper.transform_retrows_to_hashed_arrays( retrows, key_col_name='category_id') odict = {} for cat in tdict: array = tdict[cat] _get_ordered_array(array, key_col) odict[cat] = array t3 = time.time() print "Done, ordered_dict generated, num of keys = %s, time used = %0.1f" % ( len(odict), t3 - t2) print "" # STEP (3) print "step 3/4: calculate rating percentile for each sku..." #sku_dict = rows_helper.transform_retrows_to_dict(retrows, key_col_name='sku_id') for row in retrows: catid = row['category_id'] myval = row[key_col] pt = _getPercentileGreaterThan(myval, odict[catid], key_col) row['percentile_' + key_col] = pt row['sample_num'] = len( odict[catid]) if odict[catid] is not None else 0 # print "myval: %s\tpt: %s" %(myval,pt) t4 = time.time() print "Done, using seconds: %0.1f" % (t4 - t3) print "" # Step (4) print 'step 4/4: storing results in db...' 
# for item in retrows: # for key in item: # print key # break sql_cb = ''' CREATE TABLE jd_analytic_rating_percentile_latest ( sku_id bigint(20) NOT NULL, comment_count int(11) NOT NULL, this_update_time datetime NOT NULL, rating_score float NOT NULL, category_id varchar(255) NOT NULL, rating_sample_num int(11) DEFAULT 0, percentile_rating_score float DEFAULT NULL, PRIMARY KEY (sku_id), KEY skuid (sku_id) -- KEY cat_score (rating_score,category_id), -- KEY score (rating_score), -- KEY category (category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' vlist = rows_helper.transform_retrows_arrayofdicts_to_arrayoftuples( retrows) ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_rating_percentile', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t5 = time.time() print "Done, rows affected: %s, time used: %0.1f" % (ret, t5 - t4) print "" return ret
def calculate_min_max_price():
    """Aggregate per-SKU price stats (avg/min/median/max, sample count,
    min_ratio, LPDR) over the recent price window and persist them into
    jd_analytic_price_stat_latest."""
    logging.debug(
        'Reading item_dynamic history and calculate min/max/avg/median price for skus...'
    )
    started = time.time()
    cutoff = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PRICE_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    read_sql = '''
        select sku_id,
               AVG(price) as average_price,
               min(price) as min_price,
               -- median(price) as median_price, -- changed 12/22
               percentile_minx(price) as median_price,
               max(price) as max_price,
               max(update_time) as origin_time,
               count(1) as sample_count,
               min_ratio(price) as min_ratio,
               LPDR(price) as LPDR
        from
            -- jd_item_dynamic -- changed 12/22
            jd_item_price
        where
            -- update_time > '2015-11-14 0:00:00' and -- 双十一期间价格
            price > 0
        group by sku_id
        having max(update_time) >= '%s'
    ''' % (cutoff)
    logging.debug(read_sql)
    stat_rows = dbhelper.executeSqlRead2(read_sql, is_dirty=True)
    logging.debug("Done, rows to insert: %s" % len(stat_rows))
    logging.debug('using seconds: %0.1f' % (time.time() - started))
    create_sql = '''
        CREATE TABLE jd_analytic_price_stat_latest (
            sku_id bigint(20) NOT NULL,
            average_price float NOT NULL,
            min_price float NOT NULL,
            median_price float NOT NULL,
            max_price float NOT NULL,
            origin_time datetime NOT NULL,
            sample_count int(11) NOT NULL,
            min_ratio float NOT NULL,
            LPDR float NOT NULL,
            PRIMARY KEY (sku_id),
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    return crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_price_stat',
        num_cols=len(stat_rows[0]),
        value_list=stat_rows,
        is_many=True,
        need_history=False,
        sql_create_table=create_sql,
    )
def process_promo_detail():
    """Parse recent promo rows into two analytic tables.

    code == 15 ("reach X, deduct Y") rows go to jd_analytic_promo_deduction;
    code == 19 ("buy N get discount / free items") rows go to
    jd_analytic_promo_discount. Everything else is ignored.

    :return: mixed status dict from _generate_mixed_ret.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name
        from jd_analytic_promo_item_latest a
        left join jd_item_price_latest b on a.sku_id = b.sku_id
        left JOIN jd_item_category c on a.sku_id = c.sku_id
        left join jd_category d on c.category_id = d.id
        where a.dt >= "%s"
          and b.sku_id is not NULL
          and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    vlist = []
    vlist19 = []
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))
    num_15 = 0
    num_19 = 0
    num_15_repeated = 0
    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']
        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])
            for item in ret['data']:
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = max_deduction * 1.0 / price if max_deduction > 0 else 1.0
                    could_deduct = 0
                except Exception as e:
                    # BUGFIX: log the raw item; reach/deduction may be unbound
                    # (or stale from a previous iteration) when the conversion
                    # itself is what failed.
                    logging.error("bad reach/deduction item: %s" % (item,))
                    logging.error(e)
                    continue
                if price >= reach and reach > 0:
                    # Repeatable promos apply once per full 'reach' amount.
                    if is_repeat:
                        times = price // reach
                    else:
                        times = 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction
                single_discount_rate = could_deduct / price
                tp = [sku_id, dt, price, is_repeat, reach, deduction,
                      max_deduction, dr_ratio, maxp_ratio,
                      single_discount_rate, category_id, category_name,
                      pid, code, name, content, adurl, origin_dt]
                vlist.append(tp)
            if stat_has_repeat:
                num_15_repeated += 1
        elif code == 19:
            num_19 += 1
            # Classify the promo text: 0 = direct discount on total,
            # 1 = price reduction, 2 = unrecognized.
            type_word_list = ["总价打", "商品价格"]
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
            pt = re.compile(u'[\d.]+', re.U)
            pts = pt.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")
            if len(pts) < 2:
                # BUGFIX: fewer than two numbers means no record can be
                # built; previously this raised IndexError on pts[0]/pts[1].
                continue
            reach_num = discount = free_num = rf_ratio = None
            reach_num = float(pts[0])
            if deduct_type == 0:
                discount = pts[1]
            elif deduct_type == 1:
                free_num = float(pts[1])
                rf_ratio = float(free_num * 1.0 / reach_num)
            tp19 = [sku_id, dt, price, deduct_type, reach_num, discount,
                    free_num, rf_ratio, category_id, category_name,
                    pid, code, name, content, adurl, origin_dt]
            vlist19.append(tp19)
        else:
            pass
    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))
    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
            sku_id bigint(20) NOT NULL,
            add_time datetime NOT NULL,
            -- title varchar(255) NOT NULL,
            price float NOT NULL,
            is_repeat tinyint(4) NOT NULL,
            reach float NOT NULL,
            deduction float NOT NULL,
            max_deduction float NOT NULL,
            dr_ratio float NOT NULL,
            maxp_ratio float NOT NULL,
            single_discount_rate float NOT NULL,
            category_id varchar(255) NOT NULL,
            category_name varchar(255) DEFAULT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            origin_time datetime NOT NULL,
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
            sku_id bigint(20) NOT NULL,
            add_time datetime NOT NULL,
            -- title varchar(255) DEFAULT NULL,
            price float DEFAULT NULL,
            deduct_type smallint(6) DEFAULT NULL,
            reach_num smallint(6) DEFAULT NULL,
            discount float DEFAULT NULL,
            free_num smallint(6) DEFAULT NULL,
            rf_ratio float DEFAULT NULL,
            category_id varchar(255) DEFAULT NULL,
            category_name varchar(255) DEFAULT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            origin_dt datetime DEFAULT NULL,
            PRIMARY KEY (sku_id,pid),
            KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_deduction',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_deduction,
    )
    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))
    pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
        table_name='jd_analytic_promo_discount',
        num_cols=len(vlist19[0]),
        value_list=vlist19,
        is_many=True,
        need_history=False,
        sql_create_table=sql_cb_discount,
    )
    return _generate_mixed_ret([pret15, pret19])
def processItemPromo():
    """Flatten promo_json blobs from jd_promo_item_latest into promo rows
    (vlist) and gift rows (glist), persist both analytic tables, and record
    which SKUs carry gifts in jd_analytic_sku_gift.

    :return: mixed status dict from _generate_mixed_ret.
    """
    vlist = []
    glist = []
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)
    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select sku_id, dt, promo_json
        from jd_promo_item_latest
        where promo_json is not NULL
          and LENGTH(promo_json)>100
          and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))
    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # BUGFIX: was a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit; only parse errors are expected.
            num_error += 1
            continue
        rtags = obj['pickOneTag']
        for tag in rtags:
            pid = tag['pid']
            code = tag['code']
            # 加价购 (pay-extra-to-buy) promos are deliberately skipped.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag['adurl'] if 'adurl' in tag else ""
            tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
            vlist.append(tp)
        tags = obj['tags']
        for tag in tags:
            pid = tag['pid']
            code = tag['code']
            name = tag['name'] if 'name' in tag else ""
            if code == "10":
                # Gift promo: emit one row per gift in the tag.
                gifts = tag['gifts']
                for gift in gifts:
                    gift_name = "赠品"
                    try:
                        gift_name = gift['nm']
                        gift_num = gift['num'] if 'num' in gift else 1
                        gift_image = gift['mp'] if 'mp' in gift else ""
                        gift_sku_id = gift['sid'] if 'sid' in gift else ""
                        gift_gt = gift['gt'] if 'gt' in gift else ""
                        gift_gs = gift['gs'] if 'gs' in gift else ""
                        tp_gift = [sku_id, dt, pid, code, name, gift_name,
                                   gift_num, gift_image, gift_sku_id,
                                   gift_gt, gift_gs, update_date]
                        glist.append(tp_gift)
                    except Exception as e:
                        logging.debug(
                            "error in extracting gift info for sku_id = %s" % sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag['adurl'] if 'adurl' in tag else ""
                tp = [sku_id, dt, pid, code, name, content, adurl, update_date]
                vlist.append(tp)
    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)" % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))
    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
            sku_id bigint(20) NOT NULL,
            dt datetime NOT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            content varchar(255) NOT NULL,
            adurl varchar(255) DEFAULT NULL,
            update_date datetime NOT NULL,
            PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
            sku_id bigint(20) NOT NULL,
            dt datetime NOT NULL,
            pid varchar(255) NOT NULL,
            code varchar(255) NOT NULL,
            name varchar(255) NOT NULL,
            gift_name varchar(255) NOT NULL,
            gift_num int(11) NOT NULL,
            gift_image varchar(255) DEFAULT NULL,
            gift_sku_id bigint(20) NOT NULL,
            gift_gt varchar(255) DEFAULT NULL,
            gift_gs varchar(255) DEFAULT NULL,
            update_date datetime NOT NULL,
            PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    # Persist both tables (skip when there is nothing to write).
    ret1 = ret2 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )
    # Record which SKUs carry any gift at all.
    sglist = []
    cur_time = timeHelper.getNowLong()
    for gg in glist:
        sglist.append([gg[0], cur_time])
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    afr = 0
    if sglist:
        # BUGFIX: only run the bulk insert when there is data; an empty
        # batch no longer counts as a failure.
        afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
    ret3 = {
        'status': 0 if (afr > 0 or not sglist) else -1,
        'msg': "",
    }
    return _generate_mixed_ret([ret1, ret2, ret3])
def generate_category_property_mapping(): # sql = 'select * from jd_category' # retrows = dbhelper.executeSqlRead(sql) # # for row in retrows: # category_id = row['id'] # category_name = row['name'] black_list_clause = '","'.join(PROPERTY_KEY_BLACK_WORD_LIST) black_list_clause = '"%s"' % black_list_clause sql2 = ''' select sku_id, p_key, p_value, category_id from jd_item_property_latest a left join jd_item_category b using (sku_id) where LENGTH(p_value)>3 and LENGTH(p_value)<=30 and not (p_value like '%%个' and length(p_value)<=10) and p_key<>'__DEFAULT__' and LENGTH(p_key)>=6 and LENGTH(p_key)<=21 and p_key not like '%%重%%' and p_key not like '%%尺寸%%' and p_key not like '%%厚度%%' and p_key not like '%%宽度%%' and p_key not like '%%长宽高%%' and p_key not like '%%mm%%' and p_key <> '上架时间' and p_key NOT IN (%s) ''' % (black_list_clause) vlist = dbhelper.executeSqlRead2(sql2, is_dirty=True) sql_cb = ''' CREATE TABLE jd_analytic_property_latest ( sku_id bigint(20) DEFAULT NULL, p_key varchar(255) DEFAULT NULL, p_value varchar(255) DEFAULT NULL, category_id varchar(255) DEFAULT NULL, KEY skuid_categoryid (sku_id,category_id) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 ''' print "now writing to db..." t1 = time.time() ret = crawler_helper.persist_db_history_and_lastest_empty_first( table_name='jd_analytic_property', num_cols=len(vlist[0]), value_list=vlist, is_many=True, need_history=False, sql_create_table=sql_cb, ) t2 = time.time() print "using : %0.0f" % (t2 - t1) return ret