def _check_moves(board, pos):
    """Return True when ``pos`` is an in-bounds, empty cell of ``board``.

    Args:
        board: indexable board; ``len(board)`` is used as the limit for
            both coordinates (assumes a square board -- TODO confirm).
        pos: two-element sequence of coordinates.

    Returns:
        False when either coordinate is outside ``[0, len(board))`` or the
        cell value reported by ``get_value`` is not 0; True otherwise.
    """
    size = len(board)
    # Validate the coordinates BEFORE reading the cell: looking up an
    # out-of-range position first could raise, and a negative coordinate
    # could silently read a cell from the opposite edge.
    if not (0 <= pos[0] < size and 0 <= pos[1] < size):
        return False
    return get_value(board, pos) == 0
def _record_click(session_list, stepid, timestamp, poiid):
    """Attach one AOS click to the newest search record that displayed ``poiid``.

    ``session_list`` holds the records appended by ``main``: index 2 is the
    search stepid, index 5 the result dict (with ``poi_ids_list``) and index
    6 the click dict (``num`` plus, once clicked, ``click_list``).  Records
    are scanned newest-first and only the first match is updated, so one
    click is matched to at most one search.

    Returns:
        True when a record was updated, False when no record matched.
    """
    for item in reversed(session_list):
        # With a valid click stepid, only searches issued at a strictly
        # earlier step are candidates; stepid == -1 searches the whole list.
        if stepid != -1 and stepid <= item[2]:
            continue
        shown = item[5]['poi_ids_list']
        if poiid not in shown:
            continue
        click = item[6]
        click['num'] += 1
        # One entry per click: order|stepid|time|poiid|position (1-based).
        # Repeated clicks on the same search are joined with '&'.
        entry = '|'.join([
            str(click['num']), str(stepid), timestamp, poiid,
            str(shown.index(poiid) + 1)
        ])
        if click['num'] <= 1:
            click['click_list'] = entry
        else:
            click['click_list'] += '&' + entry
        return True
    return False


def main(input=sys.stdin, output=sys.stdout):
    """Join SP search records with AOS click records per uid+session.

    Each input line must have 12 tab-separated columns:
    uid, sessionid, stepid, time, position, source, action, request,
    response, cellphone, other, dt.  The position/request/response/
    cellphone/other columns are parsed as JSON.  Lines with another column
    count, or a source other than 'SP'/'AOS', are skipped.

    Consecutive lines sharing the key ``uid + '\\t' + sessionid`` form one
    group (the input is presumably sorted by that key -- verify against the
    caller).  SP lines are buffered; AOS lines are matched against the
    buffered searches by poiid.  When the key changes, every buffered record
    is formatted with ``parse`` (defined elsewhere) and written to
    ``output``.

    Args:
        input: iterable of text lines.
        output: file-like object receiving one formatted record per line.
    """
    format_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other', 'dt'
    ]
    json_fields = ('position', 'request', 'response', 'cellphone', 'other')
    last_key = '-'
    session_list = []

    for line in input:
        arr = [i.strip() for i in line.strip().split('\t')]
        if len(arr) != len(format_list):
            continue
        line_dict = dict(zip(format_list, arr))
        for i in json_fields:
            line_dict[i] = json.loads(line_dict[i])

        uid = line_dict['uid']
        sessionid = line_dict['sessionid']
        try:
            stepid = int(line_dict['stepid'])
        except (TypeError, ValueError):
            # -1 marks "no usable stepid"; click matching then searches the
            # whole buffered list.
            stepid = -1
        timestamp = line_dict['time']
        source = line_dict['source']

        if source == 'SP':
            query_dict = {'query_type': line_dict['action']}
            # keywords and category may both be present.
            for i in ('keywords', 'category'):
                if i in line_dict['request']:
                    query_dict[i] = func.get_value(line_dict['request'], i)
            if len(query_dict) < 2:
                # Neither keywords nor category: skip this record.
                continue
            result_dict = {}
            for i in ('count', 'poi_ids'):
                if i in line_dict['response']:
                    result_dict[i] = func.get_value(line_dict['response'], i)
                else:
                    result_dict[i] = '-'
            # 2014-6-13 fix: with no results the list must be empty;
            # ['-'] used to match a click whose poiid was also '-'.
            if result_dict['poi_ids'] == '-':
                result_dict['poi_ids_list'] = []
            else:
                result_dict['poi_ids_list'] = result_dict['poi_ids'].split('&')
            record = [
                uid, sessionid, stepid, timestamp, query_dict, result_dict,
                {'num': 0},
                func.dict2str(line_dict['request']),
                func.dict2str(line_dict['response']),
                func.dict2str(line_dict['position']),
                func.dict2str(line_dict['cellphone']),
            ]
        elif source == 'AOS':
            if 'poiid' not in line_dict['request']:
                # A click without poiid cannot be matched: skip it.
                continue
            poiid = func.get_value(line_dict['request'], 'poiid')
        else:
            # Unknown source: filter out.
            continue

        # Guard against data skew: once more than 10000 searches are
        # buffered, the buffer is dropped (not written) and the current SP
        # record is not buffered either.
        uid_pass = False
        if len(session_list) > 10000:
            session_list = []
            uid_pass = True

        cur_key = uid + '\t' + sessionid
        if cur_key == last_key:
            if source == 'SP':
                if not uid_pass:
                    session_list.append(record)
            else:
                # Iterates the CURRENT buffer, so an AOS line arriving right
                # after the skew reset finds nothing instead of indexing
                # into the emptied list with a stale length.
                _record_click(session_list, stepid, timestamp, poiid)
        else:
            # Key changed: emit the finished group and start a new one.
            for item in session_list:
                output.write('%s\n' % (parse(item),))
            session_list = []
            if source == 'SP':
                session_list.append(record)
                last_key = cur_key
            else:
                # A click arrived with no buffered search for this key.
                sys.stderr.write('only in AOS (%s)' % (cur_key) + '\n')

    # Emit the final group; without this the records buffered for the last
    # key would never be written.
    for item in session_list:
        output.write('%s\n' % (parse(item),))
def main(input=sys.stdin, output=sys.stdout):
    """Normalise SP search-log lines into the unified record fields.

    Each input line must have 4 tab-separated columns: tm, request,
    response, dt.  ``request`` and ``response`` are parsed as JSON objects.
    For every accepted line the fields uid, sessionid, stepid, time,
    position, source ('SP'), action, request, response, cellphone and other
    are filled into ``out_dict`` (map-like fields are serialised with
    ``func.dict2str``).

    The original docstring carried an abridged sample of this shape:
        235702000010 addr_poi_merge:true+aos_version:2.12+...+keywords:<query>
        +...+session:104715881+...+stepid:151+...+user_loc:114.398544,30.501300
        +x:114.397759+y:30.500967+queryid=... from:10.25.71.209
        +poi_ids:B001B0J0GS&B001B1ITO9&...+qii_querytype:5+count:67+...
    NOTE(review): that sample is '+'-joined key:value text while the code
    below parses request/response with json.loads -- confirm which upstream
    step performs the conversion.

    NOTE(review): in the code visible here nothing writes ``out_dict`` to
    ``output`` and the computed ``citycode`` is never stored; confirm
    whether trailing statements of this function were lost.

    Args:
        input: iterable of text lines (UTF-8; since 2014-05-22 no gbk->utf8
            conversion is applied because it garbled part of the data).
        output: file-like object (unused by the visible code).
    """
    format_list = ['tm', 'request', 'response', 'dt']
    cellphone_list = ['div', 'model', 'device', 'manufacture']
    out_defaults = {
        'uid': '-', 'sessionid': '-', 'stepid': '-', 'time': '-',
        'position': '-', 'source': 'SP', 'action': '-', 'request': '-',
        'response': '-', 'cellphone': '-', 'other': '-'
    }
    # Column order of the unified record.
    out_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other'
    ]
    pattern_uid = re.compile(r'^[\w-]+$', re.I)
    illegal_uid = ('NULL', 'unknown', 'aos')  # 2014.5.22 invalid uids
    # adcode / city-name -> citycode lookup tables.
    city_dict = func.adcode2citycode()
    # 2014-6-17 dictionary of "general demand" keywords.
    general_dict = func.loadGeneralDict()

    for line in input:
        position_dict = {}
        cellphone_dict = {}
        other_dict = {}
        # Fresh copy per line so a value set for one line (e.g. time) can
        # never leak into a later line that does not set it.
        out_dict = dict(out_defaults)

        arr = [i.strip() for i in line.strip().split('\t')]
        if len(arr) != len(format_list):
            func.counter('Count', 'line length error', 1)
            continue
        line_dict = dict(zip(format_list, arr))
        request_dict = json.loads(arr[1])
        response_dict = json.loads(arr[2])
        line_dict['request'] = request_dict
        line_dict['response'] = response_dict

        tm = line_dict['tm']
        if len(tm) > 6:
            # First six characters are reformatted as HH:MM:SS.
            out_dict['time'] = tm[0:2] + ':' + tm[2:4] + ':' + tm[4:6]

        if not request_dict or 'query_type' not in request_dict:
            func.counter('Count', 'query_type miss', 1)
            continue
        # 2014-11-19 filter crawler logs.
        if (request_dict.get('user_info', '-') == 'test'
                and request_dict.get('query_src', '-') == 'test'):
            continue

        if 'user_info' in request_dict and pattern_uid.match(request_dict['user_info']):
            uid = func.get_value(request_dict, 'user_info')
        elif 'diu' in request_dict and pattern_uid.match(request_dict['diu']):
            uid = func.get_value(request_dict, 'diu')
        else:
            # 2014-6-17 no usable user identifier: skip the line.
            continue
        # 2014-6-17 filter 'indoor_slayer' logs (about 1/3 of all logs).
        if request_dict['query_type'] == 'indoor_slayer':
            continue
        if uid in illegal_uid:
            func.counter('Count', 'uid error', 1)
            continue
        out_dict['uid'] = uid  # 2014-09-05 no longer forced to upper case

        # 2014-11-14 sessionid / stepid, each under two possible keys.
        if 'session' in request_dict:
            out_dict['sessionid'] = func.get_value(request_dict, 'session')
        elif 'sessionid' in request_dict:
            out_dict['sessionid'] = func.get_value(request_dict, 'sessionid')
        else:
            out_dict['sessionid'] = '-'
        if 'step' in request_dict:
            out_dict['stepid'] = func.get_value(request_dict, 'step')
        elif 'stepid' in request_dict:
            out_dict['stepid'] = func.get_value(request_dict, 'stepid')
        else:
            out_dict['stepid'] = '-'

        out_dict['action'] = func.get_value(request_dict, 'query_type').upper()
        if 'data_type' in request_dict:
            request_dict['data_type'] = request_dict['data_type'].upper()
            # 2014-09-05 data_type is also copied into "other".
            other_dict['data_type'] = request_dict['data_type'].upper()

        # 2014-5-5 decode the cifa blob when present.
        cifa_dict = {}
        if 'cifa' in request_dict:
            cifa_str = func.get_value(request_dict, 'cifa')
            cifa_dict = cifa.parse_cifa(cifa_str)

        for i in ('diu', 'diu2', 'diu3'):
            other_dict[i] = func.get_value(request_dict, i)
        for i in ('x', 'y', 'user_loc', 'geoobj'):
            position_dict[i] = func.get_value(request_dict, i)
            # Fall back to the cifa lon/lat (stored scaled by 10**6) when
            # the request carries no user_loc.
            if (i == 'user_loc' and position_dict[i] == '-'
                    and 'lon' in cifa_dict and 'lat' in cifa_dict):
                try:
                    position_dict[i] = (
                        str(float(cifa_dict['lon']) / 10 ** 6) + ',' +
                        str(float(cifa_dict['lat']) / 10 ** 6))
                except (TypeError, ValueError):
                    # Malformed cifa coordinates: keep '-'.
                    pass

        # 2014-5-7 derive a citycode from the request's own "city" value.
        citycode = '-'
        if 'city' in request_dict:
            # 2014-5-15 Chinese city names may carry trailing spaces.
            city = func.get_value(request_dict, 'city').strip()
            len_city = len(city)
            if len_city <= 1:
                # 2014-6-17 '==' -> '<=' so an empty city is covered too.
                city = '-'
            elif city.isdigit():
                if len_city == 2 or (len_city == 3 and not city.startswith('0')):
                    city = '0' + city
                elif len_city == 6:
                    # Six digits: treated as an adcode and mapped to citycode.
                    try:
                        city = city_dict['adcode'][city]
                    except Exception as err:
                        sys.stderr.write('adcode -> citycode error! (%s)' % (err) + '\n')
            else:
                # City name -> citycode; 2014-6-17 only append the '市'
                # suffix when the bare name is not itself a known key.
                if not city.endswith('市') and city not in city_dict['adname']:
                    city += '市'
                try:
                    city = city_dict['adname'][city]
                except Exception as err:
                    sys.stderr.write('adname -> citycode error! (%s,%s)' % (city, err) + '\n')
            citycode = city

        # user_loc looks like "114.398544,30.501300".
        user_loc_list = position_dict['user_loc'].split(',')
        if len(user_loc_list) == 2:
            try:
                tmp_citycode = func.get_citycode(user_loc_list)
            except Exception:
                sys.stderr.write('Regeo ERROR ! (%s)' % (repr(user_loc_list)) + '\n')
                tmp_citycode = []
            if tmp_citycode:
                position_dict['user_loc_city'] = tmp_citycode[0]

        # Handset fields: prefer cifa, fall back to the request
        # (2014-09-05); div is upper-cased (2014-5-21).
        for i in cellphone_list:
            origin = cifa_dict if i in cifa_dict else request_dict
            value = func.get_value(origin, i)
            cellphone_dict[i] = value.upper() if i == 'div' else value

        # 2014-6-17 flag general-demand queries.
        if 'keywords' in request_dict and request_dict['keywords'] in general_dict:
            request_dict['is_general'] = '1'
        request_dict['cifa'] = func.dict2str(cifa_dict, ';', '=')

        # Serialise the map-like fields (explicit mapping instead of eval).
        sections = {
            'position': position_dict,
            'request': request_dict,
            'response': response_dict,
            'cellphone': cellphone_dict,
            'other': other_dict,
        }
        for i in ('position', 'request', 'response', 'cellphone', 'other'):
            out_dict[i] = func.dict2str(sections[i])
def main(input=sys.stdin, output=sys.stdout):
    """Normalise AOS access-log lines into the unified record fields.

    Every input line is matched against ``pattern_apache``; the six captured
    groups are named date, time, path, para, os, resp_time.  The URL-decoded
    query string (``para``) is split into key/value pairs from which uid,
    sessionid, stepid, cellphone, other and position are derived into
    ``out_dict``.  Lines that do not match, come from the
    'autonavi-ssl-scanner' agent, or lack a valid ``diu`` are skipped.

    NOTE(review): in the code visible here ``out_dict`` is never written to
    ``output`` and ``aos_dict``/``out_list`` are never read; confirm whether
    trailing statements of this function were lost.

    Args:
        input: iterable of access-log lines.
        output: file-like object (unused by the visible code).
    """
    # Column order of the unified record.
    out_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other'
    ]
    # 2014.6.13 only the useful fields are kept.
    format_str = ('date', 'time', 'path', 'para', 'os', 'resp_time')
    # 2014.6.13 pattern changed to tolerate the trailing dt partition column.
    pattern_apache = re.compile(r"^.*?\s+.*?\s+.*?\[(.*?)\s+(.*?)\..*?\]\s+\".*?\s+(.*?)\?(.*?)\s+HTTP.*\"\s+.*?\s+.*?\s+\".*?\"\s+\"(.*?)\"\s+(.*?)\t.*?$", re.I)
    pattern_uid = re.compile(r'^[\w-]+$', re.I)
    # Output field -> query-string key (str) or list of keys copied into a
    # sub-dict.  2014-5-16 cifa info moved to cellphone; 2014-09-02 tid added.
    para_dict = {
        'uid': 'diu', 'sessionid': 'session', 'stepid': 'stepid',
        'cellphone': ['div'], 'other': ['diu2', 'diu3', 'tid']
    }
    position_key = ('user_loc', 'geoobj')
    # adcode -> citycode table.
    adcodeDict = func.adcode2citycode()

    # AOS url mapping table: tab-separated lines with exactly 3 columns;
    # column 1 (trailing '/' removed) maps to column 2.
    aos_dict = {}
    aos_file = 'aos.txt'
    with open(aos_file) as aos_handle:
        for line in aos_handle:
            arr = [i.strip() for i in line.strip().split('\t')]
            # Check the column count before unpacking so blank or short
            # lines are skipped instead of raising.
            if len(arr) != 3:
                continue
            aos_dict[arr[0].rstrip('/')] = arr[1]

    for line in input:
        p = pattern_apache.match(line)
        if not p:
            continue
        out_dict = {}
        in_dict = dict(zip(format_str, p.groups()))
        # 2014-09-05 drop test traffic.
        if in_dict['os'] == 'autonavi-ssl-scanner':
            continue
        out_dict['time'] = func.get_value(in_dict, 'time')

        para = urllib.unquote(func.get_value(in_dict, 'para'))
        tmp_para = func.str2dict(para, '&', '=')
        if ('diu' not in tmp_para or not pattern_uid.match(tmp_para['diu'])
                or tmp_para['diu'] == 'null'):
            # diu missing or invalid [2014-3-17]
            continue

        # 2014-09-02 fold 'shp_'-prefixed (searchhomepage) parameters into
        # their unprefixed names; when the unprefixed key already exists the
        # value is additionally kept under '<name>newest'.
        for k in list(tmp_para.keys()):
            if k.startswith('shp_'):
                base = k[4:]
                value = tmp_para.pop(k).strip()
                if base in tmp_para:
                    tmp_para[base + "newest"] = value
                tmp_para[base] = value
            else:
                tmp_para[k] = tmp_para[k].strip()

        for k in para_dict:
            v = para_dict[k]
            if isinstance(v, str):
                if v in tmp_para:
                    out_dict[k] = func.get_value(tmp_para, v)
                elif v == 'stepid':
                    # 2014-7-21 fall back to the alternative key 'step'.
                    out_dict[k] = func.get_value(tmp_para, 'step')
                elif v == 'session':
                    # 2014-09-02 fall back to the alternative key 'sessionid'.
                    out_dict[k] = func.get_value(tmp_para, 'sessionid')
                else:
                    out_dict[k] = '-'
            elif isinstance(v, list):
                out_dict[k] = {}
                for i in v:
                    if i in tmp_para:
                        out_dict[k][i] = func.get_value(tmp_para, i)

        # Coordinates: first usable key among the known aliases.
        x, y = '-', '-'
        for i in ('x', 'lon', 'longitude'):
            if i in tmp_para and tmp_para[i] not in ('', '-'):
                x = func.get_value(tmp_para, i)
                break
        for i in ('y', 'lat', 'latitude'):
            if i in tmp_para and tmp_para[i] not in ('', '-'):
                y = func.get_value(tmp_para, i)
                break

        # 2014-5-16 cifa handling aligned with SP: its lon/lat (scaled by
        # 10**6) back-fill the position, its handset fields go to cellphone.
        cifa_str = func.get_value(tmp_para, 'cifa')
        cifa_dict = func.str2dict(cifa_str, ';', '=')
        if x == '-' and 'lon' in cifa_dict:
            lon = func.get_value(cifa_dict, 'lon')
            try:
                x = str(float(lon) / 10 ** 6)
            except Exception:
                pass  # best effort: keep '-'
        if y == '-' and 'lat' in cifa_dict:
            lat = func.get_value(cifa_dict, 'lat')
            try:
                y = str(float(lat) / 10 ** 6)
            except Exception:
                pass  # best effort: keep '-'

        out_dict['position'] = {}
        out_dict['position']['x'] = x
        out_dict['position']['y'] = y
        try:
            adcode = xy2ccode.xy2ccode(float(x), float(y))
        except Exception:
            # Includes x or y still being '-'.
            adcode = "-"
        # Unmapped adcodes are passed through unchanged.
        out_dict['position']['citycode'] = adcodeDict.get('adcode').get(adcode, adcode)

        for i in ('device', 'model', 'manufacture'):
            out_dict['cellphone'][i] = func.get_value(cifa_dict, i)
        out_dict['cellphone']['cifa'] = cifa_str  # 2014-09-02

        for k in position_key:
            if k not in tmp_para:
                continue
            if k == 'geoobj':
                out_dict['position'][k] = func.get_value(tmp_para, k).replace('|', ';')
            else:
                out_dict['position'][k] = func.get_value(tmp_para, k)

        out_dict['source'] = "AOS"
# 2014-6-19 去掉sug测试日志 if 'query_src' in in_dict['req1'] and in_dict['req1'][ 'query_src'] == 'test': continue request_dict.update(in_dict['req2']) request_dict['tid'] = in_dict['tid'] request_dict['aos_verion'] = in_dict['sysinfo'][ 'aos_version'] if 'aos_version' in in_dict['sysinfo'] else '-' response_dict['spend_time'] = in_dict['sysinfo'][ 'spend_time'] if 'spend_time' in in_dict['sysinfo'] else '-' #print response_dict # uid out_dict['uid'] = '-' for i in ('user_info', 'diu'): if i in request_dict and request_dict[i] != '-': out_dict['uid'] = func.get_value( request_dict, i).upper() # 2014-6-21 sug的uid都转大写,同sp break if not pattern_uid.match(out_dict['uid']) and out_dict['uid'] != '-': func.counter('Count', 'uid illegal pass', 1) continue # sessionid,stepid for i in ('sessionid', 'stepid'): out_dict[i] = func.get_value(request_dict, i) # time if len(in_dict['time']) == 14: date = in_dict['time'][:8] time = in_dict['time'][8:10] + ':' + in_dict['time'][ 10:12] + ':' + in_dict['time'][12:] else: date, time = '-', '-' out_dict['time'] = time
def main(input=sys.stdin, output=sys.stdout):
    """Normalise client page/button log lines into the unified record format.

    Each input line holds 22 '||'-separated fields (see ``format_str``); the
    last field is followed by a tab and the dt partition value, which is
    discarded.  For every accepted line one tab-separated record is written
    to ``output`` with the columns listed in ``out_list``; ``source`` is
    always 'CLIENT'.

    Lines are skipped (with a ``func.counter`` increment) when the field
    count is wrong, the diu is not a valid uid, div is shorter than 10
    characters, the action resolves to one of the network-log explanations,
    or ``para`` is not valid JSON.

    Args:
        input: iterable of text lines.
        output: file-like object receiving the records.
    """
    # Page/button code table, used as pb_dict[div][page][button]['explain'].
    with open('page-button.json', 'r') as pb_handle:
        pb_dict = json.loads(pb_handle.read())

    format_str = 'id||diu||div||aid||source||service||page||button||action||time||session||x||y||para||protocol_version||diu2||diu3||dic||model||device||manufacture||stepid'
    format_list = format_str.split('||')
    cellphone_list = ['div', 'model', 'device', 'manufacture']
    out_dict = {
        'uid': '-', 'sessionid': '-', 'stepid': '-', 'time': '-',
        'position': '-', 'source': 'CLIENT', 'action': '-', 'request': '-',
        'response': '-', 'cellphone': '-', 'other': '-'
    }
    out_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other'
    ]
    pattern_uid = re.compile(r'^[\w-]+$', re.I)
    # Explanations of network-log events, which are dropped.
    skipped_explains = ('网页日志 ', '联网日志', '网络事件->网络事件')

    for line in input:
        position_dict = {}
        cellphone_dict = {}
        other_dict = {}

        arr = [i.strip() for i in line.strip().split('||')]
        # 2014-5-23 dt was appended after a tab; keep only the real field.
        arr[-1] = arr[-1].split('\t')[0]
        if len(arr) != len(format_list):
            sys.stderr.write('line length error ! %s!=%s \n\t(%s)' % (
                len(arr), len(format_list), line) + '\n')
            func.counter('Count', 'line length error', 1)
            continue
        line_dict = dict(zip(format_list, arr))

        out_dict['uid'] = func.get_value(line_dict, 'diu') if 'diu' in line_dict else '-'
        if not pattern_uid.match(out_dict['uid']):
            sys.stderr.write('diu(%s) illegal ! pass ...' % (out_dict['uid']) + '\n')
            func.counter('Count', 'uid miss match', 1)
            continue
        if 'page' not in line_dict or 'button' not in line_dict:
            func.counter('Count', 'page button miss', 1)
            sys.stderr.write('miss page or button ! pass ...(%s)' % (line) + '\n')
            continue
        page = func.get_value(line_dict, 'page')
        button = func.get_value(line_dict, 'button')

        out_dict['sessionid'] = func.get_value(line_dict, 'session') if 'session' in line_dict else '-'
        if 'step' in line_dict:
            out_dict['stepid'] = func.get_value(line_dict, 'step')
        elif 'stepid' in line_dict:
            out_dict['stepid'] = func.get_value(line_dict, 'stepid')
        else:
            out_dict['stepid'] = '-'
        other_dict['diu2'] = func.get_value(line_dict, 'diu2')
        other_dict['diu3'] = func.get_value(line_dict, 'diu3')
        position_dict['x'] = func.get_value(line_dict, 'x')
        position_dict['y'] = func.get_value(line_dict, 'y')

        # div identifies platform and app version, e.g. IOSH060100,
        # ANDH060000; its last five characters are the version.
        div = line_dict['div'].upper()
        if len(div) < 10:
            sys.stderr.write('div error ! div=(%s) pass' % (div) + '\n')
            func.counter('Count', 'div error', 1)
            continue
        ver = div[-5:]
        if div.startswith('IOS'):
            os_name = 'ios'
        elif div.startswith('AND'):
            os_name = 'android'
        else:
            # Other platforms get the '-' placeholder instead of an
            # unbound name or the previous line's value.
            os_name = '-'
        line_dict['os'] = os_name

        out_dict['action'] = 'page=%s|button=%s' % (page, button)
        other_dict['action_name'] = '-'

        # Choose the code table independently of the timestamp parsing so a
        # bad timestamp cannot leave it unset or stale.
        if ver < '60200':
            # 2014-5-13 versions below 6.2.0 use the 6.2.0 table.
            new_div = div[0:4] + '060200'
        elif div in pb_dict:
            new_div = div
        else:
            # Intermediate version: fall back to the table whose last two
            # version digits are '00'.
            new_div = div[:-2] + '00'

        try:
            # The timestamp encoding depends on the app version
            # ([2014-3-12] parsing corrected for an 8h offset): below 6.2.0
            # the value is used as seconds, otherwise it is divided by 1000
            # first; 1293811200 and 8*3600 are then added.
            stamp = int(line_dict['time'])
            if ver >= '60200':
                stamp = stamp // 1000
            t_list = time.strftime(
                '%Y-%m-%d %H:%M:%S',
                time.gmtime(stamp + 1293811200 + 8 * 3600)).split()
        except Exception:
            sys.stderr.write('time.strtime error ! (%s)' % (line_dict['time']) + '\n')
            t_list = ['-', '-']

        explain = '-'
        try:
            explain = pb_dict[new_div][page][button]['explain']
        except Exception:
            sys.stderr.write('码表查找失败!div=%s,page=%s,button=%s' % (
                div, page, button) + '\n')
        if explain in skipped_explains:
            func.counter('Count', 'pass action', 1)
            continue
        other_dict['action_name'] = explain

        out_dict['time'] = t_list[1]
        # [2014-3-17] 2014-03-17 --> 20140317
        other_dict['date'] = t_list[0].replace('-', '')

        if line_dict['para'] != '':
            try:
                para_dict = json.loads(func.get_value(line_dict, 'para'))
            except Exception:
                func.counter('Count', 'para miss', 1)
                continue
        else:
            para_dict = {}

        for i in cellphone_list:
            cellphone_dict[i] = func.get_value(line_dict, i)

        # The whole parsed line (including the added 'os') is the request;
        # the decoded para payload is emitted as the response.
        out_dict['position'] = func.dict2str(position_dict)
        out_dict['request'] = func.dict2str(line_dict)
        out_dict['response'] = func.dict2str(para_dict)
        out_dict['cellphone'] = func.dict2str(cellphone_dict)
        out_dict['other'] = func.dict2str(other_dict)
        output.write('\t'.join([out_dict[i] for i in out_list]) + '\n')


if __name__ == '__main__':
    main()