def cache_dataset(self, sc, cache_conf):
    """Load a MySQL lookup table once and publish it through ``sc.broadcast``.

    The table is only loaded when ``cache_conf['broadcast.enabled']`` is the
    boolean ``True``; a missing key or any non-bool value (for example the
    string ``'true'``) prints a configuration warning and returns ``None``.
    Results are memoised in ``self.cache_pools`` under an id derived from
    ``source.id`` and ``cache.id``, so a second call with the same ids
    returns the stored broadcast without touching MySQL.

    :param sc: object providing ``broadcast(value)`` (presumably the
        SparkContext -- confirm against the caller).
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; ``port`` defaults to 3306 and ``password``
        to the empty string.
    :return: the value returned by ``sc.broadcast``, or ``None`` when
        broadcasting is not enabled.
    :raises KeyError: if a required ``cache_conf`` entry is missing on the
        path that needs it.
    """
    enabled_flag = cache_conf.get('broadcast.enabled', False)

    # Guard: only the literal boolean True switches the broadcast cache on.
    if enabled_flag is not True:
        print('= = ' * 10,
              '[myapp CacheManager.cache_dataset] configuration warning: found cache is not enabled, with broadcast.enabled = ',
              enabled_flag)
        return None

    source_part = 'ds_id_' + str(cache_conf['source.id'])
    cache_part = "#cache_id_" + str(cache_conf['cache.id'])
    pool_key = source_part + cache_part

    # Guard: this dataset was broadcast earlier, hand back the same handle.
    if pool_key in self.cache_pools:
        print('= = ' * 10,
              '[myapp CacheManager.cache_dataset] found dataset has been cached')
        return self.cache_pools[pool_key]

    # First request for this dataset: read the whole table and broadcast it.
    mysql_host = cache_conf['host']
    mysql_port = cache_conf.get('port', 3306)
    schema = cache_conf['db']
    login = cache_conf['user']
    secret = cache_conf.get('password', '')
    table = cache_conf['tableName']
    key_column = cache_conf['keyName']
    value_columns = cache_conf['cache.keyName.list']

    select_all_sql = 'select ' + key_column + ', ' + value_columns + ' from ' + schema + '.' + table

    connection = MySQLUtils.get_connection(
        host=mysql_host, db=schema, user=login, password=secret, port=mysql_port)
    table_snapshot = MySQLUtils.query(connection, sql_text=select_all_sql, key_name=key_column)

    broadcast_handle = sc.broadcast(table_snapshot)
    self.cache_pools[pool_key] = broadcast_handle
    return broadcast_handle
def get_api_deviceinfo():
    """Read the full ``edxapp.api_deviceinfo`` lookup table from MySQL.

    Connection settings, table name and column names are hard-coded. The
    statement sent is
    ``select uuid, channel, event, uid from edxapp.api_deviceinfo`` and the
    query helper is told that ``uuid`` is the key column.

    :return: whatever ``MySQLUtils.query`` returns for that statement
        (presumably a mapping keyed by ``uuid`` -- confirm against
        ``MySQLUtils``).
    """
    # Hard-coded connection parameters for the device-info database.
    connection_settings = {
        'host': '192.168.9.228',
        'db': 'edxapp',
        'user': '******',
        'password': '',
        'port': 3306,
    }

    key_column = 'uuid'
    value_columns = 'channel, event, uid'
    qualified_table = connection_settings['db'] + '.' + 'api_deviceinfo'

    select_all_sql = 'select ' + key_column + ', ' + value_columns + ' from ' + qualified_table

    connection = MySQLUtils.get_connection(**connection_settings)
    return MySQLUtils.query(connection, sql_text=select_all_sql, key_name=key_column)
def cache_dataset(self, sc, cache_conf):
    """Broadcast a MySQL lookup table, reusing an earlier broadcast if any.

    Broadcasting happens only when ``cache_conf['broadcast.enabled']`` is
    the boolean ``True``. For every other value (absent, falsy, or a
    non-bool) a configuration warning is printed and ``None`` is returned.
    Each broadcast is remembered in ``self.cache_pools`` under
    ``'ds_id_<source.id>#cache_id_<cache.id>'``; a later call with the same
    ids returns the remembered object and issues no query.

    :param sc: object providing ``broadcast(value)`` (presumably the
        SparkContext -- confirm against the caller).
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; ``port`` defaults to 3306 and ``password``
        to the empty string.
    :return: the value returned by ``sc.broadcast``, or ``None`` when
        broadcasting is not enabled.
    :raises KeyError: if a required ``cache_conf`` entry is missing on the
        path that needs it.
    """
    enabled_flag = cache_conf.get('broadcast.enabled', False)

    # Anything other than the literal True counts as "not enabled".
    if enabled_flag is not True:
        print('= = ' * 10,
              '[myapp CacheManager.cache_dataset] configuration warning: found cache is not enabled, with broadcast.enabled = ',
              enabled_flag)
        return None

    source_part = 'ds_id_' + str(cache_conf['source.id'])
    cache_part = "#cache_id_" + str(cache_conf['cache.id'])
    pool_key = source_part + cache_part

    # Already broadcast: return the stored handle unchanged.
    if pool_key in self.cache_pools:
        print('= = ' * 10,
              '[myapp CacheManager.cache_dataset] found dataset has been cached')
        return self.cache_pools[pool_key]

    # Not broadcast yet: pull the complete table and register the result.
    mysql_host = cache_conf['host']
    mysql_port = cache_conf.get('port', 3306)
    schema = cache_conf['db']
    login = cache_conf['user']
    secret = cache_conf.get('password', '')
    table = cache_conf['tableName']
    key_column = cache_conf['keyName']
    value_columns = cache_conf['cache.keyName.list']

    select_all_sql = 'select ' + key_column + ', ' + value_columns + ' from ' + schema + '.' + table

    connection = MySQLUtils.get_connection(
        host=mysql_host, db=schema, user=login, password=secret, port=mysql_port)
    table_snapshot = MySQLUtils.query(connection, sql_text=select_all_sql, key_name=key_column)

    broadcast_handle = sc.broadcast(table_snapshot)
    self.cache_pools[pool_key] = broadcast_handle
    return broadcast_handle
def fun_userinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
    """Add the ``date_joined`` attribute to every row of a partition.

    Input: iter[Row]; output: iter[Row] (this function is a generator).

    The join key is read from the row column named by
    ``cache_conf['keyName']``. For each non-empty key the cached record is
    looked up in this order:

    1. the in-process pool ``MySQLUtils.cache_pools[cache_id]``;
    2. the broadcast stored under ``cache_conf['broadcast']``, consulted only
       when ``cache_conf['broadcast.enabled']`` is the boolean ``True``;
    3. a single-key MySQL query, whose result is then copied into the
       in-process pool.

    Rows whose key is missing, null or blank, and keys that are found
    nowhere, get ``date_joined = ''``.

    :param iter_x: iterable of rows exposing ``asDict()``.
    :param step_conf: not used by this function.
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; optional ``port`` (default 3306),
        ``password`` (default ``''``), ``broadcast.enabled`` and
        ``broadcast``.
    :return: generator yielding one ``Row`` per input row.
    :raises KeyError: if a required ``cache_conf`` entry is missing, or a
        cached record lacks ``date_joined``.
    """
    DATA_JOIN_KEY = 'date_joined'

    host = cache_conf['host']
    port = cache_conf.get('port', 3306)
    db = cache_conf['db']
    user = cache_conf['user']
    password = cache_conf.get('password', '')

    table_name = cache_conf['tableName']
    key_name = cache_conf['keyName']  # name of the join column
    cache_key_name_list = cache_conf['cache.keyName.list']

    cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
    query_sql = cache_sql + ' where ' + key_name + ' = %s'
    # TODO: support batch lookups (where <key> in (...)).

    # Only the literal boolean True enables the broadcast lookup. The
    # broadcast object may be absent even then, so it is guarded instead of
    # dereferencing ``None.value`` in the middle of a partition.
    broadcast_enabled = cache_conf.get('broadcast.enabled') is True
    broadcast = cache_conf.get('broadcast') if broadcast_enabled else None

    # Incremental in-process cache shared through MySQLUtils.cache_pools.
    cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
    cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

    conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

    def proc_update(obj, date_joined=None):
        """Store ``date_joined`` on ``obj``; falsy values become ''."""
        obj[DATA_JOIN_KEY] = date_joined or ''

    for row in iter_x:
        obj = row.asDict()

        # A null column value arrives as None; treat it like a blank key
        # instead of failing on ``None.strip()``.
        raw_key = obj.get(key_name)
        key_value = '' if raw_key is None else raw_key.strip()

        # Plain truthiness avoids ``str()`` on the key, which raises for
        # non-ASCII unicode values under Python 2.
        if not key_value:
            # No join key: nothing to look up, apply the default.
            proc_update(obj)
        elif key_value in cache_pool:
            # Hit in the in-process cache.
            proc_update(obj, cache_pool[key_value][DATA_JOIN_KEY])
        elif broadcast is not None and key_value in broadcast.value:
            # Hit in the broadcast cache.
            key_cache = broadcast.value[key_value]
            print('= = ' * 10,
                  '[myapp EnhanceUserInfoProcessor.process.fun_userinfo_in_rdd_mapPartitions] found broadcast_cache = ',
                  key_cache)
            proc_update(obj, key_cache[DATA_JOIN_KEY])
        else:
            # Not cached anywhere in memory: ask MySQL for this single key.
            external_cache = MySQLUtils.query(
                conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))
            if not external_cache:
                # Unknown to MySQL as well: apply the default.
                proc_update(obj)
            else:
                key_cache = external_cache[key_value]
                proc_update(obj, key_cache[DATA_JOIN_KEY])
                # Remember the result; dict.update works on Python 2 and 3,
                # unlike dict.iteritems().
                cache_pool.update(external_cache)
                external_cache.clear()

        yield Row(**obj)
def fun_courseinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
    """Add course attributes and a computed ``course_process`` to each row.

    Input: iter[Row]; output: iter[Row] (this function is a generator).

    The join key is read from the row column named by
    ``cache_conf['keyName']``. For each non-empty key the cached record is
    looked up in this order:

    1. the in-process pool ``MySQLUtils.cache_pools[cache_id]``;
    2. the broadcast stored under ``cache_conf['broadcast']``, consulted only
       when ``cache_conf['broadcast.enabled']`` is the boolean ``True``;
    3. a single-key MySQL query, whose result is then copied into the
       in-process pool.

    Every output row carries ``course_type``, ``owner``, ``status``,
    ``start``, ``end`` (all ``None`` when no record was found) and
    ``course_process`` as computed by the nested ``check_process``.

    :param iter_x: iterable of rows exposing ``asDict()``.
    :param step_conf: not used by this function.
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; optional ``port`` (default 3306),
        ``password`` (default ``''``), ``broadcast.enabled`` and
        ``broadcast``.
    :return: generator yielding one ``Row`` per input row.
    :raises KeyError: if a required ``cache_conf`` entry is missing, or a
        cached record lacks one of the course fields.
    """
    from datetime import datetime, timedelta

    COURSE_TYPE_KEY = 'course_type'
    COURSE_OWNER_KEY = 'owner'
    COURSE_STATUS_KEY = 'status'
    COURSE_START_KEY = 'start'
    COURSE_END_KEY = 'end'
    COURSE_PROCESS_KEY = 'course_process'

    # Fields copied from a cached record, in proc_update's argument order.
    COURSE_FIELD_KEYS = (COURSE_TYPE_KEY, COURSE_OWNER_KEY, COURSE_STATUS_KEY,
                         COURSE_START_KEY, COURSE_END_KEY)

    host = cache_conf['host']
    port = cache_conf.get('port', 3306)
    db = cache_conf['db']
    user = cache_conf['user']
    password = cache_conf.get('password', '')

    table_name = cache_conf['tableName']
    key_name = cache_conf['keyName']  # name of the join column
    cache_key_name_list = cache_conf['cache.keyName.list']

    cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
    print('= = ' * 10,
          '[myapp EnhanceCourseInfoProcessor.process.fun_courseinfo_in_rdd_mapPartitions] cache_sql = ' + cache_sql)

    query_sql = cache_sql + ' where ' + key_name + ' = %s'
    # TODO: support batch lookups (where <key> in (...)).

    # Only the literal boolean True enables the broadcast lookup. The
    # broadcast object may be absent even then, so it is guarded instead of
    # dereferencing ``None.value`` in the middle of a partition.
    broadcast_enabled = cache_conf.get('broadcast.enabled') is True
    broadcast = cache_conf.get('broadcast') if broadcast_enabled else None

    # Incremental in-process cache shared through MySQLUtils.cache_pools.
    cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
    cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

    conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

    def check_process(course_type, course_status, course_start, course_end, check_date):
        """Return the course progress flag used for real-time computation.

        ``check_date`` is compared directly against ``course_start`` and
        ``course_end``.

        * type ``"0"``: -1 when the start is missing, later than
          ``check_date``, or the status is ``"-1"``; otherwise 1 when the
          end is before ``check_date`` and 0 if not.
        * type ``"1"``: 1 when the status is ``"-1"``, else 0.
        * any other type: -1.
        """
        if course_type == "0":
            if course_start is None or course_start > check_date or course_status == "-1":
                return -1
            # Python 2 orders None before every string, so a missing end
            # date has always produced 1 here. The explicit test keeps that
            # result and avoids the TypeError Python 3 raises for
            # ``None < str``.
            # NOTE(review): confirm a course with no end date should count
            # as past its end.
            if course_end is None or course_end < check_date:
                return 1
            return 0
        if course_type == "1":
            return 1 if course_status == "-1" else 0
        return -1

    def proc_update(obj, course_type=None, course_owner=None, course_status=None,
                    course_start=None, course_end=None, check_date=None):
        """Write the course fields and the derived progress flag to ``obj``.

        ``check_date`` defaults to the current UTC+8 time taken at call
        time. A default expression in the signature would be evaluated only
        once, when this helper is defined, and every row of the partition
        would then share one stale timestamp.
        """
        if check_date is None:
            check_date = (datetime.utcnow() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
        obj[COURSE_TYPE_KEY] = course_type
        obj[COURSE_OWNER_KEY] = course_owner
        obj[COURSE_STATUS_KEY] = course_status
        obj[COURSE_START_KEY] = course_start
        obj[COURSE_END_KEY] = course_end
        obj[COURSE_PROCESS_KEY] = check_process(
            course_type, course_status, course_start, course_end, check_date)

    def apply_cached(obj, key_cache):
        """Feed the course fields of one cached record into proc_update."""
        proc_update(obj, *[key_cache[field] for field in COURSE_FIELD_KEYS])

    for row in iter_x:
        obj = row.asDict()

        # A null column value arrives as None; treat it like a blank key
        # instead of failing on ``None.strip()``.
        raw_key = obj.get(key_name)
        key_value = '' if raw_key is None else raw_key.strip()

        # Plain truthiness avoids ``str()`` on the key, which raises for
        # non-ASCII unicode values under Python 2.
        if not key_value:
            # No join key: nothing to look up, apply the defaults.
            proc_update(obj)
        elif key_value in cache_pool:
            # Hit in the in-process cache.
            apply_cached(obj, cache_pool[key_value])
        elif broadcast is not None and key_value in broadcast.value:
            # Hit in the broadcast cache.
            key_cache = broadcast.value[key_value]
            print('= = ' * 10,
                  '[myapp EnhanceCourseInfoProcessor.process.fun_courseinfo_in_rdd_mapPartitions] found broadcast_cache = ',
                  key_cache)
            apply_cached(obj, key_cache)
        else:
            # Not cached anywhere in memory: ask MySQL for this single key.
            external_cache = MySQLUtils.query(
                conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))
            if not external_cache:
                # Unknown to MySQL as well: apply the defaults.
                proc_update(obj)
            else:
                apply_cached(obj, external_cache[key_value])
                # Remember the result; dict.update works on Python 2 and 3,
                # unlike dict.iteritems().
                cache_pool.update(external_cache)
                external_cache.clear()

        yield Row(**obj)
def fun_deviceinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
    """Fill ``origin_referer``, ``spam`` and ``user_id`` from device records.

    Input: iter[Row]; output: iter[Row] (this function is a generator).

    The join key is read from the row column named by
    ``cache_conf['keyName']``. For each non-empty key the cached record is
    looked up in this order:

    1. the in-process pool ``MySQLUtils.cache_pools[cache_id]`` (only
       entries that are dicts count as hits);
    2. the broadcast stored under ``cache_conf['broadcast']``, consulted only
       when ``cache_conf['broadcast.enabled']`` is the boolean ``True``;
    3. a single-key MySQL query, whose result is then copied into the
       in-process pool.

    Update rules applied to every row (see the nested ``proc_update``):

    * ``origin_referer``: when empty in the row and the record has a
      ``channel``, take the channel; a value matching
      ``https?://<domain>/...`` is reduced to the domain; an empty result
      becomes ``'unknown'``.
    * ``spam``: when empty in the row and the record has an ``event``, take
      the event; an empty result becomes ``'unknown'``.
    * ``user_id``: replaced by the record's ``uid`` when that is truthy and
      different.

    :param iter_x: iterable of rows exposing ``asDict()``; each row must
        contain ``origin_referer`` and ``spam``, and ``user_id`` whenever a
        cached ``uid`` is applied.
    :param step_conf: not used by this function.
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; optional ``port`` (default 3306),
        ``password`` (default ``''``), ``broadcast.enabled`` and
        ``broadcast``.
    :return: generator yielding one ``Row`` per input row.
    :raises KeyError: if a required ``cache_conf`` entry or row column is
        missing, or a cached record lacks ``channel``, ``event`` or ``uid``.
    """
    import re

    domain_pattern = re.compile('https?://(.*?)/.*')

    ORIGIN_REFERER_KEY = 'origin_referer'
    CHANNEL_KEY = 'channel'
    UNKNOWN_ORIGIN_REFERER_VALUE = 'unknown'

    USER_ID_KEY = 'user_id'
    UID_KEY = 'uid'

    SPAM_KEY = 'spam'
    EVENT_KEY = 'event'
    UNKNOWN_SPAM_VALUE = 'unknown'

    host = cache_conf['host']
    port = cache_conf.get('port', 3306)
    db = cache_conf['db']
    user = cache_conf['user']
    password = cache_conf.get('password', '')

    table_name = cache_conf['tableName']
    key_name = cache_conf['keyName']
    cache_key_name_list = cache_conf['cache.keyName.list']

    cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
    query_sql = cache_sql + ' where ' + key_name + ' = %s'
    # TODO: support batch lookups (where <key> in (...)).

    # Only the literal boolean True enables the broadcast lookup. The
    # broadcast object may be absent even then, so it is guarded instead of
    # dereferencing ``None.value`` in the middle of a partition.
    broadcast_enabled = cache_conf.get('broadcast.enabled') is True
    broadcast = cache_conf.get('broadcast') if broadcast_enabled else None

    # Incremental in-process cache shared through MySQLUtils.cache_pools.
    cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
    cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

    conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

    def proc_update(obj, cache_channel=None, cache_event=None, cache_uid=None):
        """Apply the three update rules to ``obj`` in place."""
        # Rule 1: origin_referer falls back to the device channel, is
        # reduced to its domain, and defaults to 'unknown'.
        if not obj[ORIGIN_REFERER_KEY] and cache_channel:
            obj[ORIGIN_REFERER_KEY] = cache_channel
        if obj[ORIGIN_REFERER_KEY]:
            match_result = domain_pattern.match(obj[ORIGIN_REFERER_KEY])
            if match_result:
                obj[ORIGIN_REFERER_KEY] = match_result.group(1)
        if not obj[ORIGIN_REFERER_KEY]:
            obj[ORIGIN_REFERER_KEY] = UNKNOWN_ORIGIN_REFERER_VALUE

        # Rule 2: spam falls back to the device event, then to 'unknown'.
        if not obj[SPAM_KEY] and cache_event:
            obj[SPAM_KEY] = cache_event
        if not obj[SPAM_KEY]:
            obj[SPAM_KEY] = UNKNOWN_SPAM_VALUE

        # Rule 3: a cached uid overrides the logged user_id, which may not
        # be valid.
        if cache_uid and obj[USER_ID_KEY] != cache_uid:
            obj[USER_ID_KEY] = cache_uid

    def apply_cached(obj, key_cache):
        """Feed channel, event and uid of one cached record to proc_update."""
        proc_update(obj, key_cache[CHANNEL_KEY], key_cache[EVENT_KEY], key_cache[UID_KEY])

    for row in iter_x:
        obj = row.asDict()

        # A null column value arrives as None; treat it like a blank key
        # instead of failing on ``None.strip()``.
        raw_key = obj.get(key_name)
        key_value = '' if raw_key is None else raw_key.strip()
        print('= = ' * 10,
              '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found key_value =',
              key_value, ', obj = ', obj)

        # Plain truthiness avoids ``str()`` on the key, which raises for
        # non-ASCII unicode values under Python 2.
        if not key_value:
            # No join key (e.g. web logs without a uuid): no device lookup,
            # only the row's own values are normalised.
            proc_update(obj)
        elif key_value in cache_pool and isinstance(cache_pool[key_value], dict):
            # Hit in the in-process cache.
            apply_cached(obj, cache_pool[key_value])
        elif broadcast is not None and key_value in broadcast.value:
            # Hit in the broadcast cache.
            key_cache = broadcast.value[key_value]
            print('= = ' * 10,
                  '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found broadcast_cache = ',
                  key_cache)
            apply_cached(obj, key_cache)
        else:
            # Not cached anywhere in memory: ask MySQL for this single key.
            external_cache = MySQLUtils.query(
                conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))
            print('= = ' * 10,
                  '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found external_cache = ',
                  external_cache)
            if not external_cache:
                # Unknown to MySQL as well: normalise the row's own values.
                proc_update(obj)
            else:
                key_cache = external_cache[key_value]
                print('= = ' * 10,
                      '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found mysql_cache = ',
                      key_cache)
                apply_cached(obj, key_cache)
                # Remember the result; dict.update works on Python 2 and 3,
                # unlike dict.iteritems().
                cache_pool.update(external_cache)
                external_cache.clear()

        yield Row(**obj)
def fun_userinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
    """Add the ``date_joined`` attribute to every row of a partition.

    Input: iter[Row]; output: iter[Row] (this function is a generator).

    The join key is read from the row column named by
    ``cache_conf['keyName']``. For each non-empty key the cached record is
    looked up in this order:

    1. the in-process pool ``MySQLUtils.cache_pools[cache_id]``;
    2. the broadcast stored under ``cache_conf['broadcast']``, consulted only
       when ``cache_conf['broadcast.enabled']`` is the boolean ``True``;
    3. a single-key MySQL query, whose result is then copied into the
       in-process pool.

    Rows whose key is missing, null or blank, and keys that are found
    nowhere, get ``date_joined = ''``.

    :param iter_x: iterable of rows exposing ``asDict()``.
    :param step_conf: not used by this function.
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; optional ``port`` (default 3306),
        ``password`` (default ``''``), ``broadcast.enabled`` and
        ``broadcast``.
    :return: generator yielding one ``Row`` per input row.
    :raises KeyError: if a required ``cache_conf`` entry is missing, or a
        cached record lacks ``date_joined``.
    """
    DATA_JOIN_KEY = 'date_joined'

    host = cache_conf['host']
    port = cache_conf.get('port', 3306)
    db = cache_conf['db']
    user = cache_conf['user']
    password = cache_conf.get('password', '')

    table_name = cache_conf['tableName']
    key_name = cache_conf['keyName']  # name of the join column
    cache_key_name_list = cache_conf['cache.keyName.list']

    cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
    query_sql = cache_sql + ' where ' + key_name + ' = %s'
    # TODO: support batch lookups (where <key> in (...)).

    # Only the literal boolean True enables the broadcast lookup. The
    # broadcast object may be absent even then, so it is guarded instead of
    # dereferencing ``None.value`` in the middle of a partition.
    broadcast_enabled = cache_conf.get('broadcast.enabled') is True
    broadcast = cache_conf.get('broadcast') if broadcast_enabled else None

    # Incremental in-process cache shared through MySQLUtils.cache_pools.
    cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
    cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

    conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

    def proc_update(obj, date_joined=None):
        """Store ``date_joined`` on ``obj``; falsy values become ''."""
        obj[DATA_JOIN_KEY] = date_joined or ''

    for row in iter_x:
        obj = row.asDict()

        # A null column value arrives as None; treat it like a blank key
        # instead of failing on ``None.strip()``.
        raw_key = obj.get(key_name)
        key_value = '' if raw_key is None else raw_key.strip()

        # Plain truthiness avoids ``str()`` on the key, which raises for
        # non-ASCII unicode values under Python 2.
        if not key_value:
            # No join key: nothing to look up, apply the default.
            proc_update(obj)
        elif key_value in cache_pool:
            # Hit in the in-process cache.
            proc_update(obj, cache_pool[key_value][DATA_JOIN_KEY])
        elif broadcast is not None and key_value in broadcast.value:
            # Hit in the broadcast cache.
            key_cache = broadcast.value[key_value]
            print('= = ' * 10,
                  '[myapp EnhanceUserInfoProcessor.process.fun_userinfo_in_rdd_mapPartitions] found broadcast_cache = ',
                  key_cache)
            proc_update(obj, key_cache[DATA_JOIN_KEY])
        else:
            # Not cached anywhere in memory: ask MySQL for this single key.
            external_cache = MySQLUtils.query(
                conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))
            if not external_cache:
                # Unknown to MySQL as well: apply the default.
                proc_update(obj)
            else:
                key_cache = external_cache[key_value]
                proc_update(obj, key_cache[DATA_JOIN_KEY])
                # Remember the result; dict.update works on Python 2 and 3,
                # unlike dict.iteritems().
                cache_pool.update(external_cache)
                external_cache.clear()

        yield Row(**obj)
def fun_courseinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
    """Add course attributes and a computed ``course_process`` to each row.

    Input: iter[Row]; output: iter[Row] (this function is a generator).

    The join key is read from the row column named by
    ``cache_conf['keyName']``. For each non-empty key the cached record is
    looked up in this order:

    1. the in-process pool ``MySQLUtils.cache_pools[cache_id]``;
    2. the broadcast stored under ``cache_conf['broadcast']``, consulted only
       when ``cache_conf['broadcast.enabled']`` is the boolean ``True``;
    3. a single-key MySQL query, whose result is then copied into the
       in-process pool.

    Every output row carries ``course_type``, ``owner``, ``status``,
    ``start``, ``end`` (all ``None`` when no record was found) and
    ``course_process`` as computed by the nested ``check_process``.

    :param iter_x: iterable of rows exposing ``asDict()``.
    :param step_conf: not used by this function.
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; optional ``port`` (default 3306),
        ``password`` (default ``''``), ``broadcast.enabled`` and
        ``broadcast``.
    :return: generator yielding one ``Row`` per input row.
    :raises KeyError: if a required ``cache_conf`` entry is missing, or a
        cached record lacks one of the course fields.
    """
    from datetime import datetime, timedelta

    COURSE_TYPE_KEY = 'course_type'
    COURSE_OWNER_KEY = 'owner'
    COURSE_STATUS_KEY = 'status'
    COURSE_START_KEY = 'start'
    COURSE_END_KEY = 'end'
    COURSE_PROCESS_KEY = 'course_process'

    # Fields copied from a cached record, in proc_update's argument order.
    COURSE_FIELD_KEYS = (COURSE_TYPE_KEY, COURSE_OWNER_KEY, COURSE_STATUS_KEY,
                         COURSE_START_KEY, COURSE_END_KEY)

    host = cache_conf['host']
    port = cache_conf.get('port', 3306)
    db = cache_conf['db']
    user = cache_conf['user']
    password = cache_conf.get('password', '')

    table_name = cache_conf['tableName']
    key_name = cache_conf['keyName']  # name of the join column
    cache_key_name_list = cache_conf['cache.keyName.list']

    cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
    print('= = ' * 10,
          '[myapp EnhanceCourseInfoProcessor.process.fun_courseinfo_in_rdd_mapPartitions] cache_sql = ' + cache_sql)

    query_sql = cache_sql + ' where ' + key_name + ' = %s'
    # TODO: support batch lookups (where <key> in (...)).

    # Only the literal boolean True enables the broadcast lookup. The
    # broadcast object may be absent even then, so it is guarded instead of
    # dereferencing ``None.value`` in the middle of a partition.
    broadcast_enabled = cache_conf.get('broadcast.enabled') is True
    broadcast = cache_conf.get('broadcast') if broadcast_enabled else None

    # Incremental in-process cache shared through MySQLUtils.cache_pools.
    cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
    cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

    conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

    def check_process(course_type, course_status, course_start, course_end, check_date):
        """Return the course progress flag used for real-time computation.

        ``check_date`` is compared directly against ``course_start`` and
        ``course_end``.

        * type ``"0"``: -1 when the start is missing, later than
          ``check_date``, or the status is ``"-1"``; otherwise 1 when the
          end is before ``check_date`` and 0 if not.
        * type ``"1"``: 1 when the status is ``"-1"``, else 0.
        * any other type: -1.
        """
        if course_type == "0":
            if course_start is None or course_start > check_date or course_status == "-1":
                return -1
            # Python 2 orders None before every string, so a missing end
            # date has always produced 1 here. The explicit test keeps that
            # result and avoids the TypeError Python 3 raises for
            # ``None < str``.
            # NOTE(review): confirm a course with no end date should count
            # as past its end.
            if course_end is None or course_end < check_date:
                return 1
            return 0
        if course_type == "1":
            return 1 if course_status == "-1" else 0
        return -1

    def proc_update(obj, course_type=None, course_owner=None, course_status=None,
                    course_start=None, course_end=None, check_date=None):
        """Write the course fields and the derived progress flag to ``obj``.

        ``check_date`` defaults to the current UTC+8 time taken at call
        time. A default expression in the signature would be evaluated only
        once, when this helper is defined, and every row of the partition
        would then share one stale timestamp.
        """
        if check_date is None:
            check_date = (datetime.utcnow() + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
        obj[COURSE_TYPE_KEY] = course_type
        obj[COURSE_OWNER_KEY] = course_owner
        obj[COURSE_STATUS_KEY] = course_status
        obj[COURSE_START_KEY] = course_start
        obj[COURSE_END_KEY] = course_end
        obj[COURSE_PROCESS_KEY] = check_process(
            course_type, course_status, course_start, course_end, check_date)

    def apply_cached(obj, key_cache):
        """Feed the course fields of one cached record into proc_update."""
        proc_update(obj, *[key_cache[field] for field in COURSE_FIELD_KEYS])

    for row in iter_x:
        obj = row.asDict()

        # A null column value arrives as None; treat it like a blank key
        # instead of failing on ``None.strip()``.
        raw_key = obj.get(key_name)
        key_value = '' if raw_key is None else raw_key.strip()

        # Plain truthiness avoids ``str()`` on the key, which raises for
        # non-ASCII unicode values under Python 2.
        if not key_value:
            # No join key: nothing to look up, apply the defaults.
            proc_update(obj)
        elif key_value in cache_pool:
            # Hit in the in-process cache.
            apply_cached(obj, cache_pool[key_value])
        elif broadcast is not None and key_value in broadcast.value:
            # Hit in the broadcast cache.
            key_cache = broadcast.value[key_value]
            print('= = ' * 10,
                  '[myapp EnhanceCourseInfoProcessor.process.fun_courseinfo_in_rdd_mapPartitions] found broadcast_cache = ',
                  key_cache)
            apply_cached(obj, key_cache)
        else:
            # Not cached anywhere in memory: ask MySQL for this single key.
            external_cache = MySQLUtils.query(
                conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))
            if not external_cache:
                # Unknown to MySQL as well: apply the defaults.
                proc_update(obj)
            else:
                apply_cached(obj, external_cache[key_value])
                # Remember the result; dict.update works on Python 2 and 3,
                # unlike dict.iteritems().
                cache_pool.update(external_cache)
                external_cache.clear()

        yield Row(**obj)
def fun_deviceinfo_in_rdd_mapPartitions(iter_x, step_conf, cache_conf):
    """Fill ``origin_referer``, ``spam`` and ``user_id`` from device records.

    Input: iter[Row]; output: iter[Row] (this function is a generator).

    The join key is read from the row column named by
    ``cache_conf['keyName']``. For each non-empty key the cached record is
    looked up in this order:

    1. the in-process pool ``MySQLUtils.cache_pools[cache_id]`` (only
       entries that are dicts count as hits);
    2. the broadcast stored under ``cache_conf['broadcast']``, consulted only
       when ``cache_conf['broadcast.enabled']`` is the boolean ``True``;
    3. a single-key MySQL query, whose result is then copied into the
       in-process pool.

    Update rules applied to every row (see the nested ``proc_update``):

    * ``origin_referer``: when empty in the row and the record has a
      ``channel``, take the channel; a value matching
      ``https?://<domain>/...`` is reduced to the domain; an empty result
      becomes ``'unknown'``.
    * ``spam``: when empty in the row and the record has an ``event``, take
      the event; an empty result becomes ``'unknown'``.
    * ``user_id``: replaced by the record's ``uid`` when that is truthy and
      different.

    :param iter_x: iterable of rows exposing ``asDict()``; each row must
        contain ``origin_referer`` and ``spam``, and ``user_id`` whenever a
        cached ``uid`` is applied.
    :param step_conf: not used by this function.
    :param cache_conf: dict with ``source.id``, ``cache.id``, ``host``,
        ``db``, ``user``, ``tableName``, ``keyName`` and
        ``cache.keyName.list``; optional ``port`` (default 3306),
        ``password`` (default ``''``), ``broadcast.enabled`` and
        ``broadcast``.
    :return: generator yielding one ``Row`` per input row.
    :raises KeyError: if a required ``cache_conf`` entry or row column is
        missing, or a cached record lacks ``channel``, ``event`` or ``uid``.
    """
    import re

    domain_pattern = re.compile('https?://(.*?)/.*')

    ORIGIN_REFERER_KEY = 'origin_referer'
    CHANNEL_KEY = 'channel'
    UNKNOWN_ORIGIN_REFERER_VALUE = 'unknown'

    USER_ID_KEY = 'user_id'
    UID_KEY = 'uid'

    SPAM_KEY = 'spam'
    EVENT_KEY = 'event'
    UNKNOWN_SPAM_VALUE = 'unknown'

    host = cache_conf['host']
    port = cache_conf.get('port', 3306)
    db = cache_conf['db']
    user = cache_conf['user']
    password = cache_conf.get('password', '')

    table_name = cache_conf['tableName']
    key_name = cache_conf['keyName']
    cache_key_name_list = cache_conf['cache.keyName.list']

    cache_sql = 'select ' + key_name + ', ' + cache_key_name_list + ' from ' + db + '.' + table_name
    query_sql = cache_sql + ' where ' + key_name + ' = %s'
    # TODO: support batch lookups (where <key> in (...)).

    # Only the literal boolean True enables the broadcast lookup. The
    # broadcast object may be absent even then, so it is guarded instead of
    # dereferencing ``None.value`` in the middle of a partition.
    broadcast_enabled = cache_conf.get('broadcast.enabled') is True
    broadcast = cache_conf.get('broadcast') if broadcast_enabled else None

    # Incremental in-process cache shared through MySQLUtils.cache_pools.
    cache_id = 'ds_id_' + str(cache_conf['source.id']) + '#cache_id_' + str(cache_conf['cache.id'])
    cache_pool = MySQLUtils.cache_pools.setdefault(cache_id, {})

    conn = MySQLUtils.get_connection(host=host, db=db, user=user, password=password, port=port)

    def proc_update(obj, cache_channel=None, cache_event=None, cache_uid=None):
        """Apply the three update rules to ``obj`` in place."""
        # Rule 1: origin_referer falls back to the device channel, is
        # reduced to its domain, and defaults to 'unknown'.
        if not obj[ORIGIN_REFERER_KEY] and cache_channel:
            obj[ORIGIN_REFERER_KEY] = cache_channel
        if obj[ORIGIN_REFERER_KEY]:
            match_result = domain_pattern.match(obj[ORIGIN_REFERER_KEY])
            if match_result:
                obj[ORIGIN_REFERER_KEY] = match_result.group(1)
        if not obj[ORIGIN_REFERER_KEY]:
            obj[ORIGIN_REFERER_KEY] = UNKNOWN_ORIGIN_REFERER_VALUE

        # Rule 2: spam falls back to the device event, then to 'unknown'.
        if not obj[SPAM_KEY] and cache_event:
            obj[SPAM_KEY] = cache_event
        if not obj[SPAM_KEY]:
            obj[SPAM_KEY] = UNKNOWN_SPAM_VALUE

        # Rule 3: a cached uid overrides the logged user_id, which may not
        # be valid.
        if cache_uid and obj[USER_ID_KEY] != cache_uid:
            obj[USER_ID_KEY] = cache_uid

    def apply_cached(obj, key_cache):
        """Feed channel, event and uid of one cached record to proc_update."""
        proc_update(obj, key_cache[CHANNEL_KEY], key_cache[EVENT_KEY], key_cache[UID_KEY])

    for row in iter_x:
        obj = row.asDict()

        # A null column value arrives as None; treat it like a blank key
        # instead of failing on ``None.strip()``.
        raw_key = obj.get(key_name)
        key_value = '' if raw_key is None else raw_key.strip()
        print('= = ' * 10,
              '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found key_value =',
              key_value, ', obj = ', obj)

        # Plain truthiness avoids ``str()`` on the key, which raises for
        # non-ASCII unicode values under Python 2.
        if not key_value:
            # No join key (e.g. web logs without a uuid): no device lookup,
            # only the row's own values are normalised.
            proc_update(obj)
        elif key_value in cache_pool and isinstance(cache_pool[key_value], dict):
            # Hit in the in-process cache.
            apply_cached(obj, cache_pool[key_value])
        elif broadcast is not None and key_value in broadcast.value:
            # Hit in the broadcast cache.
            key_cache = broadcast.value[key_value]
            print('= = ' * 10,
                  '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found broadcast_cache = ',
                  key_cache)
            apply_cached(obj, key_cache)
        else:
            # Not cached anywhere in memory: ask MySQL for this single key.
            external_cache = MySQLUtils.query(
                conn=conn, sql_text=query_sql, key_name=key_name, sql_args=(key_value,))
            print('= = ' * 10,
                  '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found external_cache = ',
                  external_cache)
            if not external_cache:
                # Unknown to MySQL as well: normalise the row's own values.
                proc_update(obj)
            else:
                key_cache = external_cache[key_value]
                print('= = ' * 10,
                      '[myapp EnhanceApiDeviceInfoProcessor.process.fun_deviceinfo_in_rdd_mapPartitions] found mysql_cache = ',
                      key_cache)
                apply_cached(obj, key_cache)
                # Remember the result; dict.update works on Python 2 and 3,
                # unlike dict.iteritems().
                cache_pool.update(external_cache)
                external_cache.clear()

        yield Row(**obj)