def record_device_id(deviceid, platform, original_deviceid):
    """Persist a notification device id into ``user_notification_device``.

    Args:
        deviceid: Device identifier; must be at least 5 characters long.
        platform: Device platform constant. Anything other than the Android
            or iOS constant from ``notification_config`` is treated as iOS.
        original_deviceid: Stored alongside ``deviceid`` as the fourth column.

    Returns:
        ``{'status': -1, 'msg': ...}`` when ``deviceid`` is missing or too
        short; otherwise whatever
        ``crawler_helper.persist_db_history_and_latest`` returns.
    """
    # Validate first so a bad id never costs a clock read, and so ``None``
    # yields the documented error dict instead of a TypeError from len().
    if deviceid is None or len(deviceid) < 5:
        return {
            'status': -1,
            'msg': 'invalid device_id = %s' % (deviceid,),
        }

    known_platforms = (
        notification_config.CONST_DEVICE_PLATFORM_ANDROID,
        notification_config.CONST_DEVICE_PLATFORM_IOS,
    )
    if platform not in known_platforms:
        # Unknown platforms fall back to iOS.
        platform = notification_config.CONST_DEVICE_PLATFORM_IOS

    tnow = timeHelper.getNowLong()

    # record in db
    # NOTE(review): the trailing "" fills a fifth column whose meaning is not
    # visible from this function -- confirm against the table schema.
    vlist = [[deviceid, platform, tnow, original_deviceid, ""]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='user_notification_device',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=False,
    )
def record_device_id(deviceid, platform, original_deviceid):
    """Persist a notification device id into ``user_notification_device``.

    Args:
        deviceid: Device identifier; must be at least 5 characters long.
        platform: Device platform constant. Anything other than the Android
            or iOS constant from ``notification_config`` is treated as iOS.
        original_deviceid: Stored alongside ``deviceid`` as the fourth column.

    Returns:
        ``{'status': -1, 'msg': ...}`` when ``deviceid`` is missing or too
        short; otherwise whatever
        ``crawler_helper.persist_db_history_and_latest`` returns.
    """
    # Validate first so a bad id never costs a clock read, and so ``None``
    # yields the documented error dict instead of a TypeError from len().
    if deviceid is None or len(deviceid) < 5:
        return {
            'status': -1,
            'msg': 'invalid device_id = %s' % (deviceid,),
        }

    known_platforms = (
        notification_config.CONST_DEVICE_PLATFORM_ANDROID,
        notification_config.CONST_DEVICE_PLATFORM_IOS,
    )
    if platform not in known_platforms:
        # Unknown platforms fall back to iOS.
        platform = notification_config.CONST_DEVICE_PLATFORM_IOS

    tnow = timeHelper.getNowLong()

    # record in db
    # NOTE(review): the trailing "" fills a fifth column whose meaning is not
    # visible from this function -- confirm against the table schema.
    vlist = [[deviceid, platform, tnow, original_deviceid, ""]]
    return crawler_helper.persist_db_history_and_latest(
        table_name='user_notification_device',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=True,
        need_flow=False,
    )
def match_discounts():
    """Join the latest price rows with per-SKU side data and save to ``jd_worthy``.

    Reads ``jd_price_temp_latest`` (optionally restricted to ``DEBUG_SKU_ID``
    when ``IF_DEBUG_SKU`` is set), loads the deduction, discount, gift,
    first-seen and rating lookups, joins them in memory, computes the worthy
    values and persists the result without history.

    Returns:
        Whatever ``crawler_helper.persist_db_history_and_latest`` returns.

    Raises:
        IndexError: if the join produces no rows, because the column count is
            taken from the first row.
    """
    # Restrict the read to one SKU while debugging.
    sku_filter = " where sku_id=%s" % DEBUG_SKU_ID if IF_DEBUG_SKU else ""

    print('>>> 1/9 >>> Reading jd_price_temp_latest...')
    sql_price = 'select * from jd_price_temp_latest %s' % sku_filter
    retrows_price = dbhelper.executeSqlRead(sql_price, is_dirty=True)
    print('rows read: %s' % len(retrows_price))

    print('>>> 2/9 >>> Reading strongest deductions of each sku...')
    deduction_dict = _get_deduction_dict()
    print('rows read: %s' % len(deduction_dict))

    print('>>> 3/9 >>> Reading discounts of each sku...')
    discount_dict = _get_discount_dict()
    print('rows read: %s' % len(discount_dict))

    print('>>> 4/9 >>> Reading gifts of each sku...')
    gift_dict = _get_gift_dict()
    print('rows read: %s' % len(gift_dict))

    print('>>> 5/9 >>> Reading first seen date of each sku...')
    first_seen_dict = _get_item_firstseen_dict()
    print('rows read: %s' % len(first_seen_dict))

    print('>>> 6/9 >>> Reading ratings of each sku...')
    rating_dict = _get_rating_dict()
    print('rows read: %s' % len(rating_dict))

    print('>>> 7/9 >>> Joining results in memory...')
    # The return value is unused, so the helper presumably folds the other
    # lookups into deduction_dict in place -- TODO confirm.
    _merge_dict_under_key(
        deduction_dict,
        [
            discount_dict,
            gift_dict,
            first_seen_dict,
            rating_dict,
        ],
    )
    tlist = _memory_left_join(
        retrows_price,
        deduction_dict,
        col_name_list_left=cols_left,
        col_name_list_right=cols_deduction,
    )
    print('rows generated: %s' % len(tlist))

    print('>>> 8/9 >>> Calculating worthy_values...')
    _calculate_worthy_values(tlist)
    # NOTE(review): tlist[0] raises IndexError when the join yields no rows.
    print('num cols = %s ' % len(tlist[0]))

    print('>>> 9/9 >>> Saving to DB...')
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_worthy',
        num_cols=len(tlist[0]),
        value_list=tlist,
        is_many=True,
        need_history=False,
    )
def match_discounts():
    """Join the latest price rows with per-SKU side data and save to ``jd_worthy``.

    Reads ``jd_price_temp_latest`` (optionally restricted to ``DEBUG_SKU_ID``
    when ``IF_DEBUG_SKU`` is set), loads the deduction, discount, gift,
    first-seen and rating lookups, joins them in memory, computes the worthy
    values and persists the result without history.

    Returns:
        Whatever ``crawler_helper.persist_db_history_and_latest`` returns.

    Raises:
        IndexError: if the join produces no rows, because the column count is
            taken from the first row.
    """
    # Restrict the read to one SKU while debugging.
    sku_filter = " where sku_id=%s" % DEBUG_SKU_ID if IF_DEBUG_SKU else ""

    print('>>> 1/9 >>> Reading jd_price_temp_latest...')
    sql_price = 'select * from jd_price_temp_latest %s' % sku_filter
    retrows_price = dbhelper.executeSqlRead(sql_price, is_dirty=True)
    print('rows read: %s' % len(retrows_price))

    print('>>> 2/9 >>> Reading strongest deductions of each sku...')
    deduction_dict = _get_deduction_dict()
    print('rows read: %s' % len(deduction_dict))

    print('>>> 3/9 >>> Reading discounts of each sku...')
    discount_dict = _get_discount_dict()
    print('rows read: %s' % len(discount_dict))

    print('>>> 4/9 >>> Reading gifts of each sku...')
    gift_dict = _get_gift_dict()
    print('rows read: %s' % len(gift_dict))

    print('>>> 5/9 >>> Reading first seen date of each sku...')
    first_seen_dict = _get_item_firstseen_dict()
    print('rows read: %s' % len(first_seen_dict))

    print('>>> 6/9 >>> Reading ratings of each sku...')
    rating_dict = _get_rating_dict()
    print('rows read: %s' % len(rating_dict))

    print('>>> 7/9 >>> Joining results in memory...')
    # The return value is unused, so the helper presumably folds the other
    # lookups into deduction_dict in place -- TODO confirm.
    _merge_dict_under_key(
        deduction_dict,
        [
            discount_dict,
            gift_dict,
            first_seen_dict,
            rating_dict,
        ],
    )
    tlist = _memory_left_join(
        retrows_price,
        deduction_dict,
        col_name_list_left=cols_left,
        col_name_list_right=cols_deduction,
    )
    print('rows generated: %s' % len(tlist))

    print('>>> 8/9 >>> Calculating worthy_values...')
    _calculate_worthy_values(tlist)
    # NOTE(review): tlist[0] raises IndexError when the join yields no rows.
    print('num cols = %s ' % len(tlist[0]))

    print('>>> 9/9 >>> Saving to DB...')
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_worthy',
        num_cols=len(tlist[0]),
        value_list=tlist,
        is_many=True,
        need_history=False,
    )
def genPropertyTable():
    """Collapse ``jd_item_property_latest`` into one text value per SKU.

    Every usable property value of a SKU is cleaned with ``multi_replace``
    and appended, space-separated, to that SKU's text. The resulting
    ``[sku_id, text]`` pairs are written to ``jd_index_property``.

    Returns:
        Whatever ``crawler_helper.persist_db_history_and_latest`` returns.

    Raises:
        IndexError: if no row survives the filters, because the column count
            is taken from the first output row.
    """
    print("reading...")
    rows = dbhelper.executeSqlRead(
        'select * from jd_item_property_latest', is_dirty=True)

    text_by_sku = {}
    for record in rows:
        name = record['p_key']
        # Drop rows with no key, the placeholder key, or an over-long key.
        if (name is None
                or name == '__DEFAULT__'
                or name == u'__DEFAULT__'
                or len(name) > 60):
            continue

        raw = record['p_value']
        # u'无' is Chinese for "none"; such values carry no information.
        if raw is None or raw == u'无':
            continue

        cleaned = multi_replace(raw, PROPERTY_SPLITTER_LIST, ' ')
        # Skip values that shrank by more than 5 characters during cleaning.
        if len(raw) - len(cleaned) > 5:
            continue

        sku = record['sku_id']
        if sku in text_by_sku:
            text_by_sku[sku] = "%s %s" % (text_by_sku[sku], cleaned)
        else:
            text_by_sku[sku] = cleaned

    vlist = [[sku, text] for sku, text in text_by_sku.items()]

    print("writing to db...")
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_index_property',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        need_flow=False,
    )
def genPropertyTable():
    """Collapse ``jd_item_property_latest`` into one text value per SKU.

    Every usable property value of a SKU is cleaned with ``multi_replace``
    and appended, space-separated, to that SKU's text. The resulting
    ``[sku_id, text]`` pairs are written to ``jd_index_property``.

    Returns:
        Whatever ``crawler_helper.persist_db_history_and_latest`` returns.

    Raises:
        IndexError: if no row survives the filters, because the column count
            is taken from the first output row.
    """
    print("reading...")
    rows = dbhelper.executeSqlRead(
        'select * from jd_item_property_latest', is_dirty=True)

    text_by_sku = {}
    for record in rows:
        name = record['p_key']
        # Drop rows with no key, the placeholder key, or an over-long key.
        if (name is None
                or name == '__DEFAULT__'
                or name == u'__DEFAULT__'
                or len(name) > 60):
            continue

        raw = record['p_value']
        # u'无' is Chinese for "none"; such values carry no information.
        if raw is None or raw == u'无':
            continue

        cleaned = multi_replace(raw, PROPERTY_SPLITTER_LIST, ' ')
        # Skip values that shrank by more than 5 characters during cleaning.
        if len(raw) - len(cleaned) > 5:
            continue

        sku = record['sku_id']
        if sku in text_by_sku:
            text_by_sku[sku] = "%s %s" % (text_by_sku[sku], cleaned)
        else:
            text_by_sku[sku] = cleaned

    vlist = [[sku, text] for sku, text in text_by_sku.items()]

    print("writing to db...")
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_index_property',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
        need_history=False,
        need_flow=False,
    )
def calculate_base_rating_for_categories():
    """Run the category-rating query and store it in ``jd_analytic_category_rating``.

    The SQL comes from ``getSqlCatRating()``. Each result row is projected
    onto a fixed column order before being persisted.

    Returns:
        Whatever ``crawler_helper.persist_db_history_and_latest`` returns.

    Raises:
        IndexError: if the query returns no rows, because the column count is
            taken from the first output row.
    """
    # Column order of the persisted rows; must stay in sync with the table.
    columns = (
        'category_id',
        'sample_count',
        'sum_1',
        'sum_2',
        'sum_3',
        'sum_4',
        'sum_5',
        'comment_count',
        'rating_score',
        'rate_1',
        'rate_2',
        'rate_3',
        'rate_4',
        'rate_5',
        'rate_good',
        'rate_bad',
        'origin_dt',
        'dt',
        'name',
    )

    sql = getSqlCatRating()
    retrows = dbhelper.executeSqlRead(sql, is_dirty=True)
    print("rows of data selected for insert: %s" % len(retrows))

    vlist = [[row[col] for col in columns] for row in retrows]

    # NOTE(review): vlist[0] raises IndexError on an empty result set.
    return crawler_helper.persist_db_history_and_latest(
        table_name='jd_analytic_category_rating',
        num_cols=len(vlist[0]),
        value_list=vlist,
        is_many=True,
    )
def generate_worthy_mix_main():
    """Build the worthy rows and write them to ``jd_worthy`` in batches.

    Steps: load the merged source rows, compute the worthy values on them,
    convert them to insert lists using ``worthy_columns``, then persist them
    in slices of ``WRITE_STEP`` rows followed by one write for the remainder.

    Returns:
        The result of the final (remainder) call to
        ``crawler_helper.persist_db_history_and_latest``.

    Raises:
        IndexError: if there are no rows to insert, because the column count
            is taken from the first row.
    """
    t1 = time.time()
    logging.debug(
        "1/4 >>> Join all related tables: price_temp, dynamic, deduction, discount, gift, rating, last-seen, etc..."
    )
    worthy_rows = _get_merged_tables()
    t2 = time.time()
    logging.debug("Done, rows read: %s, using seconds: %s\n", len(worthy_rows), t2 - t1)

    logging.debug("2/4 >>> Calculating worthy scores and final price")
    _calculate_worthy_values(worthy_rows)
    t3 = time.time()
    logging.debug("Done, using seconds: %0.1f\n", t3 - t2)

    logging.debug("3/4 >>> Generating data for db insert")
    insert_list = rows_helper.generate_list_for_db_write(worthy_rows, worthy_columns)
    t4 = time.time()
    logging.debug("Done, using seconds: %0.1f\n", t4 - t3)

    logging.debug("4/4 >>> Now writing to db, rows = %s", len(insert_list))
    tbl_name = "jd_worthy"
    # tbl_name = 'zz_worthy_%s' %int(time.time())
    # tbl_name_latest = "%s_latest" %tbl_name
    # ret = crawler_helper.persist_db_history_and_lastest_empty_first(

    WRITE_STEP = 10000
    total_rows = len(insert_list)
    wtimes = total_rows // WRITE_STEP  # number of full batches
    wremaining = total_rows - wtimes * WRITE_STEP
    print("%s,%s,%s" % (total_rows, wtimes, wremaining))

    # One remainder write always follows the full batches.
    total_batches = wtimes + 1
    total_written = 0

    for i in range(wtimes):
        tt1 = time.time()
        partlist = insert_list[i * WRITE_STEP:(i + 1) * WRITE_STEP]
        ret = crawler_helper.persist_db_history_and_latest(
            table_name=tbl_name,
            num_cols=len(insert_list[0]),
            value_list=partlist,
            is_many=True,
            need_history=False,
        )
        afr = ret["affected_rows_latest"]
        total_written += afr
        tt2 = time.time()
        # Report batches 1-based so the label matches the "/total" denominator.
        logging.debug(
            "Written %s/%s: affected rows = %s, using seconds: %s",
            i + 1, total_batches, afr, int(tt2 - tt1),
        )

    # Remainder. NOTE(review): this slice is empty when the row count is an
    # exact multiple of WRITE_STEP, yet the write is still issued -- confirm
    # the persist helper tolerates an empty value_list.
    partlist = insert_list[wtimes * WRITE_STEP:]
    ret = crawler_helper.persist_db_history_and_latest(
        table_name=tbl_name,
        num_cols=len(insert_list[0]),
        value_list=partlist,
        is_many=True,
        need_history=False,
    )
    afr = ret["affected_rows_latest"]
    total_written += afr
    logging.debug("Written %s/%s: affected rows = %s", total_batches, total_batches, afr)
    logging.debug(">>> Done, total affected rows = %s >>>", total_written)

    # logging.debug('Now altering table name...')
    # afr = dbhelper.rename_table(tbl_name_latest, 'jd_worthy_latest', if_delete_duplicate=True)
    t5 = time.time()
    logging.debug("Done, using seconds: %0.1f\n", t5 - t4)
    return ret