def parse(out_list=None):
    """Serialize one session record into a tab-separated output line.

    The record layout used by the caller is
    [uid, sessionid, stepid, time, query, result, click,
     request, response, position, cellphone].

    Args:
        out_list: sequence of record fields. Fields may be strings, ints,
            lists of strings, or dicts. ``None`` is treated as an empty
            record.

    Returns:
        The fields joined by tabs, or ``''`` when the record has fewer than
        six fields.

    The caller's list and the dicts inside it are left untouched; all
    conversions happen on a shallow copy.
    """
    if out_list is None or len(out_list) < 6:
        return ''

    fields = list(out_list)

    # The helper key 'poi_ids_list' is only needed while matching clicks and
    # must not be emitted.  Only a dict can carry it; guarding on the type
    # avoids a substring test on strings and a TypeError on ints.
    result = fields[5]
    if isinstance(result, dict) and 'poi_ids_list' in result:
        result = dict(result)
        del result['poi_ids_list']
        fields[5] = result

    for idx, value in enumerate(fields):
        if isinstance(value, dict):
            # func.dict2str is a project helper; presumably it renders the
            # dict as a single string -- confirm against its definition.
            fields[idx] = func.dict2str(value)
        elif isinstance(value, list):
            fields[idx] = '|'.join(value)
        elif isinstance(value, int):
            fields[idx] = str(value)
        # Any other type is passed through unchanged.

    return '\t'.join(fields)
def main(input=sys.stdin, output=sys.stdout):
    """Merge search ('SP') records with click ('AOS') records per session.

    Each input line must have twelve tab-separated fields:
    uid, sessionid, stepid, time, position, source, action, request,
    response, cellphone, other, dt.  The position/request/response/cellphone/
    other fields are decoded with ``json.loads``.

    Consecutive lines sharing the same ``uid + '\\t' + sessionid`` key are
    treated as one session (assumes the input is grouped by that key --
    TODO confirm with the upstream job).  Search records are buffered; a
    click record is attached to the most recent buffered search whose result
    list contains the clicked poiid.  When the key changes, and at end of
    input, the buffered records are written to *output* via ``parse``.

    Args:
        input: iterable of text lines (defaults to stdin).
        output: object with a ``write`` method (defaults to stdout).

    Raises:
        ValueError: if one of the JSON fields of a well-formed line cannot be
            decoded (unchanged from the original behaviour).
    """
    format_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other', 'dt'
    ]
    json_fields = ('position', 'request', 'response', 'cellphone', 'other')
    # Sessions with more buffered searches than this are discarded to avoid
    # data skew caused by abnormal users.
    max_session_len = 10000

    last_key = '-'
    session_list = []
    skip_session = False  # True once the current session was found oversized

    for line in input:
        arr = [field.strip() for field in line.strip().split('\t')]
        if len(arr) != len(format_list):
            # Malformed line: wrong number of columns.
            continue

        line_dict = dict(zip(format_list, arr))
        for name in json_fields:
            line_dict[name] = json.loads(line_dict[name])

        uid = line_dict['uid']
        sessionid = line_dict['sessionid']
        try:
            stepid = int(line_dict['stepid'])
        except ValueError:
            stepid = -1  # -1 marks an unusable step id
        timestamp = line_dict['time']
        source = line_dict['source']
        request = line_dict['request']

        record = None
        poiid = None
        if source == 'SP':
            # Search record: collect query (keywords/category) and results.
            query_dict = {'query_type': line_dict['action']}
            for name in ('keywords', 'category'):
                if name in request:
                    query_dict[name] = func.get_value(request, name)
            if len(query_dict) < 2:
                # Neither keywords nor category present: skip the record.
                continue

            response = line_dict['response']
            result_dict = {}
            for name in ('count', 'poi_ids'):
                if name in response:
                    result_dict[name] = func.get_value(response, name)
                else:
                    result_dict[name] = '-'
            # An empty result must yield an empty list, otherwise a poiid of
            # '-' would accidentally match at position 1.
            if result_dict['poi_ids'] == '-':
                result_dict['poi_ids_list'] = []
            else:
                result_dict['poi_ids_list'] = result_dict['poi_ids'].split('&')

            record = [
                uid, sessionid, stepid, timestamp, query_dict, result_dict,
                {'num': 0},
                func.dict2str(request),
                func.dict2str(response),
                func.dict2str(line_dict['position']),
                func.dict2str(line_dict['cellphone']),
            ]
        elif source == 'AOS':
            # Click record: only the clicked poiid is needed.
            if 'poiid' not in request:
                continue
            poiid = func.get_value(request, 'poiid')
        else:
            # Unknown source: drop the line.
            continue

        # Oversized session: throw away what was buffered and ignore the rest
        # of this session.  The flag must outlive this iteration, otherwise
        # buffering would silently restart on the next line.
        if len(session_list) > max_session_len:
            session_list = []
            skip_session = True

        cur_key = uid + '\t' + sessionid
        if cur_key == last_key:
            if skip_session:
                continue
            if source == 'SP':
                session_list.append(record)
            else:
                _attach_click(session_list, poiid, stepid, timestamp)
        else:
            # Key changed: emit the finished session and start a new one.
            _flush_session(session_list, output)
            session_list = []
            skip_session = False
            if source == 'SP':
                session_list.append(record)
                last_key = cur_key
            else:
                # A click arrived without any preceding search record.
                sys.stderr.write('only in AOS (%s)\n' % (cur_key))

    # Emit the last session; without this the final key would be lost.
    _flush_session(session_list, output)


def _flush_session(session_list, output):
    """Write each buffered search record to *output* as one line."""
    for item in session_list:
        output.write(parse(item) + '\n')


def _attach_click(session_list, poiid, stepid, timestamp):
    """Record a click on the newest buffered search that displayed *poiid*.

    Searches are scanned from newest to oldest.  When *stepid* is valid
    (not -1) only searches with a strictly smaller step id are considered.
    A click is attached to at most one search.  Each click is stored in the
    search's click dict as ``order|stepid|time|poiid|pos``; multiple clicks
    on the same search are joined with ``&``.
    """
    for item in reversed(session_list):
        if stepid != -1 and stepid <= item[2]:
            continue
        shown = item[5]['poi_ids_list']
        if poiid not in shown:
            continue
        pos = shown.index(poiid) + 1
        click = item[6]
        order = click['num'] + 1
        click['num'] = order
        entry = '|'.join(
            [str(order), str(stepid), timestamp, poiid, str(pos)])
        if order <= 1:
            click['click_list'] = entry
        else:
            click['click_list'] += '&' + entry
        break
position_dict['user_loc_city'] = user_loc_city # 手机相关信息 for i in cellphone_list: if i in cifa_dict: if i == 'div': # [2014-5-21] div 大写 cellphone_dict[i] = func.get_value(cifa_dict,i).upper() else: cellphone_dict[i] = func.get_value(cifa_dict,i) else: if i == 'div': cellphone_dict[i] = func.get_value(request_dict,i).upper() else: cellphone_dict[i] = func.get_value(request_dict,i) #2014-09-05 cifa_dict->request_dict # 2014-6-17 新增字段is_general标记是否泛需求 if 'keywords' in request_dict: if request_dict['keywords'] in general_dict: request_dict['is_general'] = '1' request_dict['cifa'] = func.dict2str(cifa_dict,';','=') for i in ('position','request','response','cellphone','other'): out_dict[i] = func.dict2str(eval("%s_dict"%(i))) try: print >>output,'\t'.join([out_dict[i] for i in out_list]) except Exception,err: print >>sys.stderr,'out_dict print error! (%s)'%(repr(out_dict)) pass #print json.dumps(tmp_dict,ensure_ascii=False,encoding='utf-8',indent=4) if __name__ == '__main__': main()
#out_dict['action'] = in_dict['path'].rstrip('/') # action,去掉右侧/ out_dict['request'] = tmp_para # request out_dict['request'].update(in_dict) # 2014-6-13 6.11增加冗余字段失效,导致aos格式偏移,原因是正则错误,包含了tab和分区字段 #out_dict['other'] = {} # info #2014-09-02 不要重置字典,会把已有内容清空 # [2014-5-16] 将中文解释添加进去 out_dict['other']['action_name'] = action_name #out_dict['other']['diu2'] = func.get_value(tmp_para,'diu2') #2014-09-02前面已取ine68-83 #out_dict['other']['diu3'] = func.get_value(tmp_para,'diu3') #2014-09-02前面已取ine68-83 out_dict['response'] = {} # response,aos不记录返回信息 # pack result: out_str = '-' for i in out_list: if i not in out_dict: #20141112,garnett #print >>sys.stderr,'key(%s) not in out_dict!'%(i) #func.counter('Count','out_dict miss',1) continue v = out_dict[i] if type(v) == type({}): if out_str == '-': out_str = func.dict2str(v) else: out_str += '\t' + func.dict2str(v) else: out_str += '\t' + v print >>output,out_str.lstrip('-\t') if __name__ == '__main__': main()
# sessionid,stepid for i in ('sessionid', 'stepid'): out_dict[i] = func.get_value(request_dict, i) # time if len(in_dict['time']) == 14: date = in_dict['time'][:8] time = in_dict['time'][8:10] + ':' + in_dict['time'][ 10:12] + ':' + in_dict['time'][12:] else: date, time = '-', '-' out_dict['time'] = time other_dict['date'] = date # position for i in ('geoobj', 'user_loc', 'user_city'): if i == 'geoobj': # [2014-5-21]geoobj分隔符替换: | --> ; position_dict[i] = func.get_value(request_dict, i).replace('|', ';') else: position_dict[i] = func.get_value(request_dict, i) out_dict['action'] = func.get_value(request_dict, 'query_type') # cellphone for i in ('div', 'platform'): if i == 'div': # [2014-5-21] div大写 cellphone_dict[i] = func.get_value(request_dict, i).upper() else: cellphone_dict[i] = func.get_value(request_dict, i) for i in ('position', 'request', 'response', 'cellphone', 'other'): out_dict[i] = func.dict2str(eval("%s_dict" % (i))) print '\t'.join([out_dict[i] for i in out_list])
other_dict['date'] = t_list[0].replace( '-', '') # [2014-3-17] 2014-03-17 --> 20140317 if line_dict['para'] != '': try: #para_dict = json.loads(arr[13]) para_dict = json.loads(func.get_value(line_dict, 'para')) except Exception, err: #print >>sys.stderr,'[error] json data:para=[%s]'%(arr[13]) func.counter('Count', 'para miss', 1) continue else: #print >>sys.stderr,'para empty!(%s)'%(line) para_dict = {} pass # 手机相关信息 for i in cellphone_list: cellphone_dict[i] = func.get_value(line_dict, i) # 请求信息 request_dict = line_dict out_dict['position'] = func.dict2str(position_dict) out_dict['request'] = func.dict2str(request_dict) out_dict['response'] = func.dict2str(para_dict) out_dict['cellphone'] = func.dict2str(cellphone_dict) out_dict['other'] = func.dict2str(other_dict) print >> output, '\t'.join([out_dict[i] for i in out_list]) #print json.dumps(tmp_dict,ensure_ascii=False,encoding='utf-8',indent=4) if __name__ == '__main__': main()