def record_device_id(deviceid, platform, original_deviceid):
    """Persist a notification device id into ``user_notification_device``.

    Args:
        deviceid: Device identifier; must be at least 5 characters long.
        platform: Device platform.  Any value other than the Android / iOS
            constants of ``notification_config`` is replaced by the iOS one.
        original_deviceid: Stored next to ``deviceid`` unchanged.

    Returns:
        ``{'status': -1, 'msg': ...}`` when ``deviceid`` is missing or too
        short; otherwise whatever
        ``crawler_helper.persist_db_history_and_latest`` returns.
    """
    # Validate before doing anything else: None used to blow up in len(),
    # and there is no point computing a timestamp for a rejected id.
    if deviceid is None or len(deviceid) < 5:
        return {
            'status': -1,
            'msg': 'invalid device_id = %s' % (deviceid,),
        }

    known_platforms = (
        notification_config.CONST_DEVICE_PLATFORM_ANDROID,
        notification_config.CONST_DEVICE_PLATFORM_IOS,
    )
    if platform not in known_platforms:
        # Unknown platforms are recorded as iOS.
        platform = notification_config.CONST_DEVICE_PLATFORM_IOS

    tnow = timeHelper.getNowLong()

    # record in db (single row; the trailing "" fills the table's 5th column)
    vlist = [[deviceid, platform, tnow, original_deviceid, ""]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='user_notification_device',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=False,
    )
def run_tasks_repeated(self):
    """Call ``self.run_tasks_once()`` in an endless loop, sleeping in between.

    After each run:
      * daily mode (``self.is_daily``): sleep for
        ``timeHelper.getTimeLeftTillTomorrow() + self.START_HOUR*3600 + 10``
        seconds;
      * interval mode: sleep for what is left of ``self.interval_seconds``
        after the run (never negative) plus 10 seconds.
    The sleep is never shorter than ``self.min_sleep``.

    An exception raised by ``run_tasks_once`` is logged and counted as an
    error; it does not stop the loop.  This method never returns.
    """
    while True:
        has_error = False
        logging.info("\n\n\nJob start at %s" % timeHelper.getNowLong())
        t1 = time.time()
        try:
            has_error = self.run_tasks_once()
        except Exception as e:
            logging.error("ERROR GREP LINE >>>>>>" * 5)
            # logging.exception keeps the traceback, which logging.error(e)
            # silently dropped.
            logging.exception(e)
            has_error = True
        logging.info("HAS ERROR TODAY? %s" % has_error)
        t2 = time.time()
        logging.info("RUN-ONCE TOTALLY TAKING TIME: %s seconds" % (t2 - t1))
        logging.info("-" * 60)

        if self.is_daily:
            remaining = (timeHelper.getTimeLeftTillTomorrow()
                         + self.START_HOUR * 3600 + 10)
            # 3600.0 avoids integer truncation of the logged hours on Python 2.
            logging.info(
                "Sleep to tomorrow, at %s:00:00 hour: hours left = %s"
                % (self.START_HOUR, remaining / 3600.0))
        else:
            inter = max(self.interval_seconds - int(t2 - t1), 0)
            remaining = inter + 10
            logging.info("Sleep to fill interval, seconds left = %s"
                         % (remaining))

        if remaining < self.min_sleep:
            remaining = self.min_sleep
        time.sleep(remaining)
def fill_catalog():
    """Populate ``jd_catalog`` and ``jd_catalog_map`` from ``JD_CATALOG_ARRAY``.

    Catalogs are numbered 1000, 2000, ... in the order they appear in
    ``JD_CATALOG_ARRAY``.  Each catalog contributes one ``jd_catalog`` row
    and one ``jd_catalog_map`` row per entry of its ``category_ids`` list.
    The rows of ``_get_fixed_catalog()`` are appended to the ``jd_catalog``
    batch.  The results of the two bulk inserts are printed.
    """
    stamp = timeHelper.getNowLong()
    catalog_rows = []
    map_rows = []

    for position, entry in enumerate(JD_CATALOG_ARRAY, 1):
        catalog_id = position * 1000
        category_names = entry['category_ids']
        catalog_name = entry['catalog_name'].strip()

        for raw_name in category_names:
            prefix = _get_category_id_prefix_given_category_name(
                raw_name.strip())
            # prefix values come first, followed by the catalog columns
            map_row = list(prefix)
            map_row.extend([catalog_id, catalog_name, stamp])
            map_rows.append(map_row)

        catalog_rows.append([catalog_id, catalog_name, catalog_id, stamp])

    catalog_rows.extend(_get_fixed_catalog())
    inserted_catalogs = dbhelper.executeSqlWriteMany(
        'insert into jd_catalog values(%s,%s,%s,%s)', catalog_rows)
    inserted_maps = dbhelper.executeSqlWriteMany(
        'insert into jd_catalog_map values(%s,%s,%s,%s,%s)', map_rows)

    print('jd_catalog rows inserted: %s' % inserted_catalogs)
    print('jd_catalog_map rows inserted: %s' % inserted_maps)
def fill_catalog():
    """Fill the ``jd_catalog`` and ``jd_catalog_map`` tables.

    Walks ``JD_CATALOG_ARRAY`` assigning ids 1000, 2000, ... to successive
    catalogs, builds one catalog row per entry and one mapping row per
    category name listed under ``category_ids``, adds the rows returned by
    ``_get_fixed_catalog()`` to the catalog batch, bulk-inserts both
    batches and prints what each insert returned.
    """
    created_at = timeHelper.getNowLong()
    next_id = 1000
    mapping_batch = []
    catalog_batch = []

    for spec in JD_CATALOG_ARRAY:
        names = spec['category_ids']
        title = spec['catalog_name'].strip()

        for name in names:
            mapping = []
            mapping += _get_category_id_prefix_given_category_name(
                name.strip())
            mapping += [next_id, title, created_at]
            mapping_batch.append(mapping)

        catalog_batch.append([next_id, title, next_id, created_at])
        next_id += 1000

    catalog_sql = 'insert into jd_catalog values(%s,%s,%s,%s)'
    catalog_batch += _get_fixed_catalog()
    catalog_result = dbhelper.executeSqlWriteMany(catalog_sql, catalog_batch)

    mapping_sql = 'insert into jd_catalog_map values(%s,%s,%s,%s,%s)'
    mapping_result = dbhelper.executeSqlWriteMany(mapping_sql, mapping_batch)

    print('jd_catalog rows inserted: %s' % catalog_result)
    print('jd_catalog_map rows inserted: %s' % mapping_result)
def record_device_id(deviceid, platform, original_deviceid):
    """Store a notification device id in ``user_notification_device``.

    Args:
        deviceid: Device identifier; rejected when ``None`` or shorter than
            5 characters.
        platform: Device platform; anything other than the Android / iOS
            constants of ``notification_config`` is stored as iOS.
        original_deviceid: Saved together with ``deviceid`` as given.

    Returns:
        A ``{'status': -1, 'msg': ...}`` dict for a rejected ``deviceid``,
        else the result of ``crawler_helper.persist_db_history_and_latest``.
    """
    # Reject bad ids up front.  The original called len() on the raw value,
    # which raised TypeError for None, and did so only after computing the
    # timestamp and normalising the platform.
    if deviceid is None or len(deviceid) < 5:
        return {
            'status': -1,
            'msg': 'invalid device_id = %s' % (deviceid,),
        }

    supported = (
        notification_config.CONST_DEVICE_PLATFORM_ANDROID,
        notification_config.CONST_DEVICE_PLATFORM_IOS,
    )
    if platform not in supported:
        # Fall back to iOS for unrecognised platforms.
        platform = notification_config.CONST_DEVICE_PLATFORM_IOS

    tnow = timeHelper.getNowLong()

    # record in db (one row; the last column is written as an empty string)
    vlist = [[deviceid, platform, tnow, original_deviceid, ""]]
    ret = crawler_helper.persist_db_history_and_latest(
        table_name='user_notification_device',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=False,
    )
    return ret
def _get_fixed_catalog():
    """Return the four hard-coded catalog rows, stamped with the current time.

    Each row is a 4-item list ``[key, display name, number, timestamp]``;
    ``fill_catalog`` appends these rows to its ``jd_catalog`` insert batch.
    """
    stamp = timeHelper.getNowLong()
    fixed_entries = (
        ('_ALL_', '全部折扣', 2000000),
        ('_EXPENSIVE_', '超值折扣', 1000000),
        ('HOT', '最新发现', 2500000),
        ('_HISTORY_LOWEST_', '历史最低', 3000000),
    )
    return [[key, title, number, stamp] for key, title, number in fixed_entries]
def _get_fixed_catalog():
    """Build the rows for the catalogs that are not derived from categories.

    Returns a list of four ``[key, display name, number, timestamp]`` lists
    that all share one timestamp taken from ``timeHelper.getNowLong()``.
    """
    created_at = timeHelper.getNowLong()
    rows = []
    for key, title, number in [
        ('_ALL_', '全部折扣', 2000000),
        ('_EXPENSIVE_', '超值折扣', 1000000),
        ('HOT', '最新发现', 2500000),
        ('_HISTORY_LOWEST_', '历史最低', 3000000),
    ]:
        rows.append([key, title, number, created_at])
    return rows
def update_history_lowest_store():
    """Refresh the history-lowest notification store.

    Steps, in order:
      1. insert (ignoring duplicates) one ``(sku_id, timestamp)`` row per id
         from ``getHistoryLowest_SkuIds()`` into
         ``jd_notification_history_lowest``;
      2. record the run time for ``NOTIFICATION_JOB_NAME`` in
         ``jd_notification_job_status``;
      3. call ``_removeOldNotifications()``;
      4. call ``_removeOutdated_Nonhistory_lowest()``.

    Returns:
        A 4-item list with the result of each step, in that order.
    """
    sku_ids = getHistoryLowest_SkuIds()
    stamp = timeHelper.getNowLong()
    rows = [[sku_id, stamp] for sku_id in sku_ids]

    inserted = dbhelper.executeSqlWriteMany(
        'insert ignore into jd_notification_history_lowest values (%s,%s)',
        rows)

    # Both interpolated values are produced by this module, not by callers.
    status_sql = 'replace into jd_notification_job_status values("%s","%s")' % (
        NOTIFICATION_JOB_NAME, stamp)
    status_written = dbhelper.executeSqlWrite1(status_sql)

    old_removed = _removeOldNotifications()
    outdated_removed = _removeOutdated_Nonhistory_lowest()
    return [inserted, status_written, old_removed, outdated_removed]
def _calculate_worthy_values(sku_info_list): for sku in sku_info_list: col_name_list = [ 'discount_rate', 'max_deduction_ratio', 'discount', 'rf_ratio', 'gift_ratio', ] param_dict = {} for item in col_name_list: param_dict[item] = 1.0 index = _get_column_index(item) if sku[index] is not None: param_dict[item] = float(sku[index]) if item == 'max_deduction_ratio': param_dict[item] = 1.0 - float(sku[index]) # The most easy algorithm to calculate final worthy score value1 = _calculate_weighted_score(param_dict, col_worthyvalue_weight_dict_1) sku[_get_column_index('worthy_value1')] = value1 value2 = _calculate_weighted_score( param_dict, col_worthyvalue_weight_dict_acitivity) sku[_get_column_index('activity_discount_rate')] = value2 value3 = _calculate_weighted_score( param_dict, col_worthyvalue_weight_dict_deduct_even) sku[_get_column_index('total_discount_rate')] = value3 sku[_get_column_index('final_price')] = caculate_final_price( sku, price=None) nowtime = timeHelper.getNowLong() sku[_get_column_index('this_update_time')] = nowtime # if value1<1: # print return 0
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Insert one row into ``user_events`` for a search or a catalog visit.

    A non-empty ``query`` marks a search: ``catalog_id`` is blanked and the
    catalog name is stored as "".  Otherwise ``query`` is stored as "" and
    the catalog name is looked up in ``jd_catalog`` ('Unknown' when no row
    matches).

    Args:
        device_id: Caller-supplied device identifier.
        query: Search text, or ``None`` / "" for a catalog visit.
        catalog_id: Catalog key; only used when ``query`` is empty.
        remote_ip: Caller address stored with the event.

    Returns:
        The value returned by ``dbhelper.executeSqlWriteMany`` for the insert.
    """
    import re  # local import: the file's import section is not visible here

    catalog_name = ""
    if query is not None and len(query) > 0:
        catalog_id = ""
    else:
        query = ""
        catalog_key = "%s" % (catalog_id,)
        # The lookup is still string-built because no parameterized read
        # helper is visible here, so only keys made of letters, digits,
        # '_' and '-' are allowed to reach the SQL text.  Anything else
        # cannot match a catalog and is reported as 'Unknown'.
        if re.match(r'[A-Za-z0-9_\-]*\Z', catalog_key):
            sql2 = 'select * from jd_catalog where catalog_id="%s"' % catalog_key
            retrows = dbhelper.executeSqlRead(sql2)
        else:
            retrows = []
        if len(retrows) > 0:
            catalog_name = retrows[0]['catalog_name']
        else:
            catalog_name = 'Unknown'

    # All values here come from the caller, so let the driver quote them
    # instead of formatting them into the statement.
    # NOTE(review): assumes executeSqlWriteMany returns the same kind of
    # affected-row value executeSqlWrite1 did -- confirm against dbhelper.
    sql_user_event = 'insert into user_events values(%s,%s,%s,%s,%s,%s)'
    event_row = [device_id, query, catalog_id, catalog_name,
                 timeHelper.getNowLong(), remote_ip]
    afr = dbhelper.executeSqlWriteMany(sql_user_event, [event_row])
    return afr
def run_tasks_repeated(self):
    """Run ``self.run_tasks_once()`` forever with a pause after every run.

    Pause length:
      * ``self.is_daily`` true:
        ``timeHelper.getTimeLeftTillTomorrow() + self.START_HOUR*3600 + 10``
        seconds;
      * otherwise: the unused part of ``self.interval_seconds`` (clamped at
        zero) plus 10 seconds.
    In both cases the pause is raised to ``self.min_sleep`` if it is shorter.

    Exceptions from ``run_tasks_once`` are logged and treated as
    ``has_error = True``; the loop keeps going.  This method never returns.
    """
    while True:
        has_error = False
        logging.info("\n\n\nJob start at %s" % timeHelper.getNowLong())
        t1 = time.time()
        try:
            has_error = self.run_tasks_once()
        except Exception as e:
            logging.error("ERROR GREP LINE >>>>>>" * 5)
            # Use logging.exception so the traceback ends up in the log;
            # logging.error(e) recorded only the message.
            logging.exception(e)
            has_error = True
        logging.info("HAS ERROR TODAY? %s" % has_error)
        t2 = time.time()
        logging.info("RUN-ONCE TOTALLY TAKING TIME: %s seconds" % (t2 - t1))
        logging.info("-" * 60)

        if self.is_daily:
            remaining = (timeHelper.getTimeLeftTillTomorrow()
                         + self.START_HOUR * 3600 + 10)
            # Divide by a float so Python 2 does not truncate the hours.
            logging.info(
                "Sleep to tomorrow, at %s:00:00 hour: hours left = %s"
                % (self.START_HOUR, remaining / 3600.0))
        else:
            inter = self.interval_seconds - int(t2 - t1)
            if inter < 0:
                inter = 0
            remaining = inter + 10
            logging.info("Sleep to fill interval, seconds left = %s"
                         % (remaining))

        time.sleep(max(remaining, self.min_sleep))
def _calculate_worthy_values(sku_info_list): for sku in sku_info_list: col_name_list = [ 'discount_rate', 'max_deduction_ratio', 'discount', 'rf_ratio', 'gift_ratio', ] param_dict = {} for item in col_name_list: param_dict[item] = 1.0 index = _get_column_index(item) if sku[index] is not None: param_dict[item] = float(sku[index]) if item=='max_deduction_ratio': param_dict[item] = 1.0 - float(sku[index]) # The most easy algorithm to calculate final worthy score value1 = _calculate_weighted_score(param_dict, col_worthyvalue_weight_dict_1) sku[_get_column_index('worthy_value1')] = value1 value2 = _calculate_weighted_score(param_dict, col_worthyvalue_weight_dict_acitivity) sku[_get_column_index('activity_discount_rate')] = value2 value3 = _calculate_weighted_score(param_dict, col_worthyvalue_weight_dict_deduct_even) sku[_get_column_index('total_discount_rate')] = value3 sku[_get_column_index('final_price')] = caculate_final_price(sku,price=None) nowtime = timeHelper.getNowLong() sku[_get_column_index('this_update_time')] = nowtime # if value1<1: # print return 0
def do_log_user_event(device_id, query, catalog_id, remote_ip):
    """Record a user event (search or catalog visit) in ``user_events``.

    When ``query`` is non-empty the event is a search: ``catalog_id`` is
    replaced by "" and the stored catalog name is "".  Otherwise ``query``
    is stored as "" and the catalog name is read from ``jd_catalog``, with
    'Unknown' used when the catalog is not found.

    Args:
        device_id: Caller-supplied device identifier.
        query: Search text, or ``None`` / "" for a catalog visit.
        catalog_id: Catalog key; only used when ``query`` is empty.
        remote_ip: Caller address stored with the event.

    Returns:
        The value returned by ``dbhelper.executeSqlWriteMany`` for the insert.
    """
    import re  # local import: the file's import section is not visible here

    catalog_name = ""
    if query is not None and len(query) > 0:
        catalog_id = ""
    else:
        query = ""
        catalog_name = 'Unknown'
        catalog_key = "%s" % (catalog_id,)
        # No parameterized read helper is visible in this file, so the key
        # is restricted to letters, digits, '_' and '-' before it is placed
        # into the statement; other keys are treated as not found.
        if re.match(r'[A-Za-z0-9_\-]*\Z', catalog_key):
            sql2 = 'select * from jd_catalog where catalog_id="%s"' % catalog_key
            retrows = dbhelper.executeSqlRead(sql2)
            if len(retrows) > 0:
                catalog_name = retrows[0]['catalog_name']

    # Caller-controlled values are passed as parameters rather than being
    # formatted into the SQL text.
    # NOTE(review): assumes executeSqlWriteMany returns the same kind of
    # affected-row value executeSqlWrite1 did -- confirm against dbhelper.
    sql_user_event = 'insert into user_events values(%s,%s,%s,%s,%s,%s)'
    event_row = [device_id, query, catalog_id, catalog_name,
                 timeHelper.getNowLong(), remote_ip]
    afr = dbhelper.executeSqlWriteMany(sql_user_event, [event_row])
    return afr
def processItemPromo():
    """Flatten recent ``promo_json`` blobs into promotion rows and gift rows.

    Reads ``jd_promo_item_latest`` rows whose ``dt`` is not older than the
    cut-off from ``timeHelper.getTimeAheadOfNowHours(PROMO_ITEM_RECENCY_HOURS)``
    and parses each ``promo_json``:

      * every ``pickOneTag`` entry except code "17" becomes a promotion row;
      * every ``tags`` entry with code "10" becomes one gift row per gift,
        any other code becomes a promotion row.

    Promotion rows are persisted as ``jd_analytic_promo_item``, gift rows as
    ``jd_analytic_promo_gift``, and the sku ids that have gifts are inserted
    into ``jd_analytic_sku_gift``.

    Rows whose JSON cannot be parsed (or is not a JSON object) are counted
    and skipped.

    Returns:
        ``_generate_mixed_ret`` applied to the three persistence results
        (the first two are ``None`` when there was nothing to persist).
    """
    vlist = []  # promotion rows
    glist = []  # gift rows
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)

    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select
            sku_id, dt, promo_json
        from
            jd_promo_item_latest
        where
            promo_json is not NULL
            and LENGTH(promo_json)>100
            and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))

    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Only parse failures are expected here; a bare except would
            # also swallow KeyboardInterrupt / SystemExit.
            num_error += 1
            continue
        if not isinstance(obj, dict):
            num_error += 1
            continue

        # A blob without one of the two lists simply contributes no rows
        # of that kind instead of aborting the whole run with KeyError.
        for tag in obj.get('pickOneTag') or []:
            pid = tag['pid']
            code = tag['code']
            # Code 17 (add-on purchase at extra cost) is not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag.get('adurl', "")
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj.get('tags') or []:
            pid = tag['pid']
            code = tag['code']
            name = tag.get('name', "")
            if code == "10":
                # gift
                for gift in tag['gifts']:
                    try:
                        gift_name = gift['nm']
                        gift_num = gift.get('num', 1)
                        gift_image = gift.get('mp', "")
                        gift_sku_id = gift.get('sid', "")
                        gift_gt = gift.get('gt', "")
                        gift_gs = gift.get('gs', "")
                        glist.append([
                            sku_id, dt, pid, code, name, gift_name, gift_num,
                            gift_image, gift_sku_id, gift_gt, gift_gs,
                            update_date
                        ])
                    except Exception as e:
                        logging.debug(
                            "error in extracting gift info for sku_id = %s"
                            % sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag.get('adurl', "")
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl, update_date])

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)"
                  % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB (skipped for an empty batch: num_cols needs a first row)
    ret1 = ret2 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

    # record which skus have gifts
    cur_time = timeHelper.getNowLong()
    sglist = [[gg[0], cur_time] for gg in glist]
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
    # NOTE(review): status is -1 whenever the insert-ignore reports no new
    # rows, including "no gifts found" -- confirm this is intended.
    ret3 = {
        'status': 0 if afr > 0 else -1,
        'msg': "",
    }
    return _generate_mixed_ret([ret1, ret2, ret3])
def process_promo_detail():
    """Derive deduction (code 15) and discount (code 19) rows from promo items.

    Joins ``jd_analytic_promo_item_latest`` with price and category tables
    for rows newer than the ``PROMO_ITEM_RECENCY_HOURS`` cut-off, then:

      * code 15: one row per (reach, deduction, is_repeat) entry returned by
        ``_extract_reach_deduction_array(content)``, with the derived ratios;
      * code 19: one row per promotion, with the numbers extracted from the
        promotion text.

    Entries that cannot be parsed are logged and skipped.  The two batches
    are persisted as ``jd_analytic_promo_deduction`` and
    ``jd_analytic_promo_discount``; an empty batch is not persisted.

    Returns:
        ``_generate_mixed_ret([pret15, pret19])`` where an entry is ``None``
        if its batch was empty.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name
        from jd_analytic_promo_item_latest a
        left join jd_item_price_latest b
            on a.sku_id = b.sku_id
        left JOIN jd_item_category c
            on a.sku_id = c.sku_id
        left join jd_category d
            on c.category_id = d.id
        where a.dt >= "%s"
            and b.sku_id is not NULL
            and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []    # code 15 rows
    vlist19 = []  # code 19 rows
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop invariants for the code 19 branch.
    type_word_list = ["总价打", "商品价格"]
    number_pattern = re.compile(u'[\\d.]+', re.U)

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])

            for item in ret['data']:
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = (max_deduction * 1.0 / price
                                  if max_deduction > 0 else 1.0)
                    could_deduct = 0
                except Exception as e:
                    # Log the raw entry: reach/deduction may not be bound
                    # (or may be stale from a previous entry) at this point.
                    logging.error(
                        "bad reach/deduction entry for sku_id = %s: %r"
                        % (sku_id, item))
                    logging.error(e)
                    continue

                if price >= reach and reach > 0:
                    times = price // reach if is_repeat else 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction

                # price can be 0 here when max_deduction is 0
                single_discount_rate = could_deduct / price if price > 0 else 0.0
                vlist.append([
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ])

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # Buy N items to get a discount or a price reduction.
            #   0: direct discount
            #   1: amount taken off the item price
            #   2: other
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_pattern.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            discount = free_num = rf_ratio = None
            try:
                reach_num = float(pts[0])
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    rf_ratio = float(free_num * 1.0 / reach_num)
            except (IndexError, ValueError, ZeroDivisionError) as e:
                # Too few numbers, a malformed number or reach_num == 0:
                # skip this promotion instead of aborting the whole run.
                logging.error(
                    "cannot parse discount content for sku_id = %s: %s"
                    % (sku_id, e))
                continue

            vlist19.append([
                sku_id, dt, price, deduct_type, reach_num, discount, free_num,
                rf_ratio, category_id, category_name, pid, code, name,
                content, adurl, origin_dt
            ])

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # len(batch[0]) raised IndexError on an empty batch, so persist only
    # non-empty batches (processItemPromo in this file does the same and
    # also hands None entries to _generate_mixed_ret).
    pret15 = pret19 = None
    if vlist:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    if vlist19:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )
    return _generate_mixed_ret([pret15, pret19])
def processItemPromo():
    """Turn recent ``promo_json`` blobs into promotion rows and gift rows.

    Source rows come from ``jd_promo_item_latest`` (non-NULL ``promo_json``
    longer than 100 characters, ``dt`` not older than the cut-off from
    ``timeHelper.getTimeAheadOfNowHours(PROMO_ITEM_RECENCY_HOURS)``).

      * ``pickOneTag`` entries, except code "17", become promotion rows;
      * ``tags`` entries with code "10" become one gift row per gift, all
        other codes become promotion rows.

    Promotion rows go to ``jd_analytic_promo_item``, gift rows to
    ``jd_analytic_promo_gift``, and the sku ids owning gifts are inserted
    into ``jd_analytic_sku_gift``.  Rows whose JSON cannot be parsed, or is
    not a JSON object, are counted and skipped.

    Returns:
        ``_generate_mixed_ret`` applied to the three persistence results
        (the first two are ``None`` when their batch was empty).
    """
    vlist = []  # promotion rows
    glist = []  # gift rows
    update_date = timeHelper.getNowLong()
    recent = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, timeHelper.FORMAT_LONG)

    logging.debug('Reading jd_promo_item_latest...')
    sql = '''
        select
            sku_id, dt, promo_json
        from
            jd_promo_item_latest
        where
            promo_json is not NULL
            and LENGTH(promo_json)>100
            and dt>="%s"
    ''' % recent
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    num_error = 0
    num17 = 0
    logging.debug('completed!')
    logging.debug("Total rows with promo_json: %s" % len(retrows))

    for row in retrows:
        sku_id = row['sku_id']
        dt = row['dt']
        try:
            obj = json.loads(row['promo_json'])
        except (ValueError, TypeError):
            # Catch parse failures only; the former bare except also hid
            # KeyboardInterrupt / SystemExit.
            num_error += 1
            continue
        if not isinstance(obj, dict):
            num_error += 1
            continue

        # Missing lists mean "no promotions of that kind" rather than a
        # KeyError that would abort the whole batch.
        for tag in obj.get('pickOneTag') or []:
            pid = tag['pid']
            code = tag['code']
            # Code 17 (add-on purchase at extra cost) is not recorded.
            if code == "17":
                num17 += 1
                continue
            name = tag['name']
            content = tag['content']
            adurl = tag.get('adurl', "")
            vlist.append(
                [sku_id, dt, pid, code, name, content, adurl, update_date])

        for tag in obj.get('tags') or []:
            pid = tag['pid']
            code = tag['code']
            name = tag.get('name', "")
            if code == "10":
                # gift
                for gift in tag['gifts']:
                    try:
                        gift_name = gift['nm']
                        gift_num = gift.get('num', 1)
                        gift_image = gift.get('mp', "")
                        gift_sku_id = gift.get('sid', "")
                        gift_gt = gift.get('gt', "")
                        gift_gs = gift.get('gs', "")
                        glist.append([
                            sku_id, dt, pid, code, name, gift_name, gift_num,
                            gift_image, gift_sku_id, gift_gt, gift_gs,
                            update_date
                        ])
                    except Exception as e:
                        logging.debug(
                            "error in extracting gift info for sku_id = %s"
                            % sku_id)
                        logging.debug("%s" % e)
            else:
                content = tag['content']
                adurl = tag.get('adurl', "")
                vlist.append(
                    [sku_id, dt, pid, code, name, content, adurl, update_date])

    logging.error("IGNOR-ABLE: num of errors: %s (like json.loads error)"
                  % num_error)
    logging.debug('num17: %s' % num17)
    logging.debug('vlist len: %s' % len(vlist))
    logging.debug('glist len: %s' % len(glist))

    sql_cb_promo_item = '''
        CREATE TABLE jd_analytic_promo_item_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_promo_gift = '''
        CREATE TABLE jd_analytic_promo_gift_latest (
          sku_id bigint(20) NOT NULL,
          dt datetime NOT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          gift_name varchar(255) NOT NULL,
          gift_num int(11) NOT NULL,
          gift_image varchar(255) DEFAULT NULL,
          gift_sku_id bigint(20) NOT NULL,
          gift_gt varchar(255) DEFAULT NULL,
          gift_gs varchar(255) DEFAULT NULL,
          update_date datetime NOT NULL,
          PRIMARY KEY (sku_id,pid)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # persist in DB (skipped for an empty batch: num_cols needs a first row)
    ret1 = ret2 = None
    if len(vlist) > 0:
        ret1 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_item',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_item,
        )
    if len(glist) > 0:
        ret2 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_gift',
            num_cols=len(glist[0]),
            value_list=glist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_promo_gift,
        )

    # record which skus have gifts
    cur_time = timeHelper.getNowLong()
    sglist = [[gg[0], cur_time] for gg in glist]
    sql_gg = 'insert ignore into jd_analytic_sku_gift values(%s,%s)'
    afr = dbhelper.executeSqlWriteMany(sql_gg, sglist)
    # NOTE(review): status is -1 whenever the insert-ignore reports no new
    # rows, including "no gifts found" -- confirm this is intended.
    ret3 = {
        'status': 0 if afr > 0 else -1,
        'msg': "",
    }
    return _generate_mixed_ret([ret1, ret2, ret3])
def process_promo_detail():
    """Build deduction (code 15) and discount (code 19) rows from promo items.

    Reads ``jd_analytic_promo_item_latest`` joined with price and category
    tables for rows newer than the ``PROMO_ITEM_RECENCY_HOURS`` cut-off.

      * code 15: one output row per (reach, deduction, is_repeat) entry
        returned by ``_extract_reach_deduction_array(content)``, including
        the derived ratios;
      * code 19: one output row per promotion, using the numbers found in
        the promotion text.

    Unparseable entries are logged and skipped.  Each non-empty batch is
    persisted (``jd_analytic_promo_deduction`` /
    ``jd_analytic_promo_discount``).

    Returns:
        ``_generate_mixed_ret([pret15, pret19])``; an entry is ``None`` when
        its batch was empty.
    """
    today = timeHelper.getTimeAheadOfNowHours(
        datamining_config.PROMO_ITEM_RECENCY_HOURS, '%Y-%m-%d %H:%M:%S')
    sql = '''
        select a.*, b.price, d.id as category_id, d.name as category_name
        from jd_analytic_promo_item_latest a
        left join jd_item_price_latest b
            on a.sku_id = b.sku_id
        left JOIN jd_item_category c
            on a.sku_id = c.sku_id
        left join jd_category d
            on c.category_id = d.id
        where a.dt >= "%s"
            and b.sku_id is not NULL
            and b.price is not NULL
    ''' % today
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)

    vlist = []    # code 15 rows
    vlist19 = []  # code 19 rows
    dt = timeHelper.getNowLong()
    logging.debug('num total promo_item rows: %s' % len(retrows))

    num_15 = 0
    num_19 = 0
    num_15_repeated = 0

    # Loop invariants for the code 19 branch.
    type_word_list = ["总价打", "商品价格"]
    number_pattern = re.compile(u'[\\d.]+', re.U)

    for row in retrows:
        sku_id = row['sku_id']
        code = int(row['code'])
        content = row['content'] if 'content' in row else ""
        adurl = row['adurl'] if 'adurl' in row else ""
        origin_dt = row['dt']
        pid = row['pid']
        name = row['name'] if 'name' in row else ""
        price = float("%s" % row['price'])
        category_id = row['category_id']
        category_name = row['category_name']

        if code == 15:
            num_15 += 1
            ret = _extract_reach_deduction_array(content)
            stat_has_repeat = False
            max_deduction = float(ret['max'])

            for item in ret['data']:
                try:
                    reach = float(item[0])
                    deduction = float(item[1])
                    is_repeat = item[2]
                    if is_repeat == 1:
                        stat_has_repeat = True
                    dr_ratio = deduction * 1.0 / reach
                    maxp_ratio = (max_deduction * 1.0 / price
                                  if max_deduction > 0 else 1.0)
                    could_deduct = 0
                except Exception as e:
                    # Report the raw entry; reach/deduction may be unbound
                    # (or left over from the previous entry) when parsing
                    # fails.
                    logging.error(
                        "bad reach/deduction entry for sku_id = %s: %r"
                        % (sku_id, item))
                    logging.error(e)
                    continue

                if price >= reach and reach > 0:
                    times = price // reach if is_repeat else 1
                    could_deduct = times * deduction
                    if could_deduct > max_deduction:
                        could_deduct = max_deduction

                # price can be 0 here when max_deduction is 0
                single_discount_rate = could_deduct / price if price > 0 else 0.0
                vlist.append([
                    sku_id, dt, price, is_repeat, reach, deduction,
                    max_deduction, dr_ratio, maxp_ratio, single_discount_rate,
                    category_id, category_name, pid, code, name, content,
                    adurl, origin_dt
                ])

            if stat_has_repeat:
                num_15_repeated += 1

        elif code == 19:
            num_19 += 1
            # Buy N items to get a discount or a price reduction.
            #   0: direct discount
            #   1: amount taken off the item price
            #   2: other
            deduct_type = 0
            for type_word in type_word_list:
                if content.find(type_word) >= 0:
                    break
                deduct_type += 1
            if deduct_type == 2:
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")
                logging.error(content)
                logging.error("NEW TYPE OF DISCOUNT FOUND!!!")

            pts = number_pattern.findall(content)
            if len(pts) != 2:
                if '可购买热销商品' not in content:
                    logging.error(content)
                    logging.error("NEW PATTERN ABOVE")

            discount = free_num = rf_ratio = None
            try:
                reach_num = float(pts[0])
                if deduct_type == 0:
                    discount = pts[1]
                elif deduct_type == 1:
                    free_num = float(pts[1])
                    rf_ratio = float(free_num * 1.0 / reach_num)
            except (IndexError, ValueError, ZeroDivisionError) as e:
                # Too few numbers, a malformed number or reach_num == 0:
                # skip this promotion rather than crash the whole run.
                logging.error(
                    "cannot parse discount content for sku_id = %s: %s"
                    % (sku_id, e))
                continue

            vlist19.append([
                sku_id, dt, price, deduct_type, reach_num, discount, free_num,
                rf_ratio, category_id, category_name, pid, code, name,
                content, adurl, origin_dt
            ])

    logging.debug("code = 15, cases = %s" % num_15)
    logging.debug("code = 15, repeated = %s" % num_15_repeated)
    logging.debug("rows to insert = %s" % len(vlist))

    sql_cb_deduction = '''
        CREATE TABLE jd_analytic_promo_deduction_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) NOT NULL,
          price float NOT NULL,
          is_repeat tinyint(4) NOT NULL,
          reach float NOT NULL,
          deduction float NOT NULL,
          max_deduction float NOT NULL,
          dr_ratio float NOT NULL,
          maxp_ratio float NOT NULL,
          single_discount_rate float NOT NULL,
          category_id varchar(255) NOT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_time datetime NOT NULL,
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''
    sql_cb_discount = '''
        CREATE TABLE jd_analytic_promo_discount_latest (
          sku_id bigint(20) NOT NULL,
          add_time datetime NOT NULL,
          -- title varchar(255) DEFAULT NULL,
          price float DEFAULT NULL,
          deduct_type smallint(6) DEFAULT NULL,
          reach_num smallint(6) DEFAULT NULL,
          discount float DEFAULT NULL,
          free_num smallint(6) DEFAULT NULL,
          rf_ratio float DEFAULT NULL,
          category_id varchar(255) DEFAULT NULL,
          category_name varchar(255) DEFAULT NULL,
          pid varchar(255) NOT NULL,
          code varchar(255) NOT NULL,
          name varchar(255) NOT NULL,
          content varchar(255) NOT NULL,
          adurl varchar(255) DEFAULT NULL,
          origin_dt datetime DEFAULT NULL,
          PRIMARY KEY (sku_id,pid),
          KEY skuid (sku_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    '''

    # len(batch[0]) raised IndexError on an empty batch, so persist only
    # non-empty batches (processItemPromo in this file does the same and
    # also hands None entries to _generate_mixed_ret).
    pret15 = pret19 = None
    if vlist:
        pret15 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_deduction',
            num_cols=len(vlist[0]),
            value_list=vlist,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_deduction,
        )

    logging.debug("code = 19, cases = %s" % num_19)
    logging.debug("rows to insert = %s" % len(vlist19))

    if vlist19:
        pret19 = crawler_helper.persist_db_history_and_lastest_empty_first(
            table_name='jd_analytic_promo_discount',
            num_cols=len(vlist19[0]),
            value_list=vlist19,
            is_many=True,
            need_history=False,
            sql_create_table=sql_cb_discount,
        )
    return _generate_mixed_ret([pret15, pret19])
(select * from jd_item_comment_count_latest where CommentCount>=100) a left join jd_item_category b on a.SkuId = b.sku_id -- where a.dt > '2015-10-1' group by b.category_id ) e left JOIN jd_category c on e.category_id = c.id ''' %(today) return sql if __name__ == "__main__": while True: print 'job start: %s' %timeHelper.getNowLong() t1 = time.time() ret = calculate_base_rating_for_categories() print ret t2 = time.time() print "Finished in seconds: %s" %(t2-t1) remaining = timeHelper.getTimeLeftTillTomorrow() print "now sleeping for hours: %s" %(remaining/3600) time.sleep(remaining)