Exemplo n.º 1
0
def parse(out_list=None):
    """Render one session record as a single tab-separated output line.

    The record layout, as documented by the caller, is:
    [uid, sessionid, stepid, time, query, result, click, request, response,
     position, cellphone]

    Each element is converted to text as follows:
      * dict  -> ``func.dict2str(element)``
      * list  -> elements joined with ``'|'``
      * int   -> ``str(element)``
      * anything else is passed to the final join unchanged.

    The helper key ``'poi_ids_list'`` is dropped from the result dict
    (element 5) before conversion, so it never reaches the output.

    The input record is NOT modified: conversion happens on a shallow copy
    of the list, and the result dict is copied before the key is removed.

    Args:
        out_list: the record to render; ``None`` or a record with fewer
            than 6 elements yields an empty string.

    Returns:
        The tab-joined line, or ``''`` for a missing/too-short record.
    """
    if out_list is None or len(out_list) < 6:
        return ''

    # Work on a copy so the caller's record stays intact.
    fields = list(out_list)

    # 'poi_ids_list' is only used for click matching; strip it from the
    # result dict. Only dicts are touched, so a non-dict element 5 no longer
    # triggers a TypeError on deletion.
    result = fields[5]
    if isinstance(result, dict) and 'poi_ids_list' in result:
        result = dict(result)
        del result['poi_ids_list']
        fields[5] = result

    for idx, value in enumerate(fields):
        if isinstance(value, dict):
            fields[idx] = func.dict2str(value)
        elif isinstance(value, list):
            fields[idx] = '|'.join(value)
        elif isinstance(value, int):
            fields[idx] = str(value)
    return '\t'.join(fields)
Exemplo n.º 2
0
def _record_click(session_list, poiid, stepid, click_time):
    """Attach a click on `poiid` to the latest matching search of the session.

    Searches are scanned from the most recently appended one backwards. When
    `stepid` is valid (not -1), only searches with a strictly smaller stepid
    are considered. The first search whose ``poi_ids_list`` contains `poiid`
    gets its click info (element 6) updated:

      * ``'num'`` is incremented (it counts the clicks on that search);
      * ``'click_list'`` receives ``order|stepid|time|poiid|pos`` where
        ``pos`` is the 1-based position of the poi in the displayed list;
        multiple clicks are separated by ``'&'``.

    A click matches at most one search. Returns True when a match was found.
    """
    for item in reversed(session_list):
        if stepid != -1 and stepid <= item[2]:
            # Valid stepid: the click can only belong to an earlier step.
            continue
        shown = item[5]['poi_ids_list']
        if poiid not in shown:
            continue
        click_info = item[6]
        order = click_info['num'] + 1
        click_info['num'] = order
        entry = '|'.join([
            str(order),
            str(stepid), click_time, poiid,
            str(shown.index(poiid) + 1)
        ])
        if order <= 1:
            # First click on this search.
            click_info['click_list'] = entry
        else:
            # Repeated / additional click on the same search.
            click_info['click_list'] += '&' + entry
        return True
    return False


def main(input=sys.stdin, output=sys.stdout):
    """Join search (SP) records with click (AOS) records per session.

    Each input line must hold exactly the 12 tab-separated columns of
    ``format_list``; lines with another column count are skipped. The
    ``position``, ``request``, ``response``, ``cellphone`` and ``other``
    columns are parsed as JSON.

    Consecutive lines sharing the same ``uid`` + ``sessionid`` form one
    session (the grouping only works if the input arrives grouped by key):

      * ``source == 'SP'``: a search. It is buffered with its query info,
        result info and an empty click counter. Searches with neither
        ``keywords`` nor ``category`` in the request are skipped.
      * ``source == 'AOS'``: a click. Its ``poiid`` is matched against the
        buffered searches of the current session (see `_record_click`).
        Clicks without a ``poiid`` are skipped.
      * any other source is skipped.

    When the key changes, the buffered searches are rendered with `parse`
    and written to `output`, one per line.

    Changes relative to the previous revision:
      * records are written to `output` instead of always going through a
        bare ``print`` (the parameter used to be ignored);
      * after the oversized-session reset, a click no longer raises
        IndexError by indexing the just-emptied buffer.

    Args:
        input: iterable of text lines.
        output: object with a ``write`` method receiving the result lines.
    """
    # Input columns: [uid sessionid stepid time position source action
    #                 request response cellphone other dt]
    format_list = [
        'uid', 'sessionid', 'stepid', 'time', 'position', 'source', 'action',
        'request', 'response', 'cellphone', 'other', 'dt'
    ]
    json_fields = ('position', 'request', 'response', 'cellphone', 'other')
    last_key = '-'
    session_list = []

    for line in input:
        arr = [col.strip() for col in line.strip().split('\t')]
        if len(arr) != len(format_list):
            continue
        line_dict = dict(zip(format_list, arr))
        for name in json_fields:
            line_dict[name] = json.loads(line_dict[name])
        uid = line_dict['uid']
        sessionid = line_dict['sessionid']
        # stepid: -1 marks a missing / non-numeric step.
        try:
            stepid = int(line_dict['stepid'])
        except ValueError:
            stepid = -1
        time_str = line_dict['time']
        source = line_dict['source']

        if source == 'SP':
            request = line_dict['request']
            response = line_dict['response']
            # Query info: query_type plus keywords and/or category
            # (both may be present).
            query_dict = {'query_type': line_dict['action']}
            for name in ('keywords', 'category'):
                if name in request:
                    query_dict[name] = func.get_value(request, name)
            if len(query_dict) < 2:
                # Neither keywords nor category: skip this record.
                continue
            # Result info: count and the displayed poi ids.
            result_dict = {}
            for name in ('count', 'poi_ids'):
                if name in response:
                    result_dict[name] = func.get_value(response, name)
                else:
                    result_dict[name] = '-'
            # An empty result must give an empty list; otherwise a click
            # with poiid '-' would match the placeholder at position 1.
            if result_dict['poi_ids'] == '-':
                result_dict['poi_ids_list'] = []
            else:
                result_dict['poi_ids_list'] = result_dict['poi_ids'].split('&')
            # Record layout: [uid, sessionid, stepid, time, query, result,
            #                 click, request, response, position, cellphone]
            record = [
                uid, sessionid, stepid, time_str, query_dict, result_dict, {
                    'num': 0
                },
                func.dict2str(request),
                func.dict2str(response),
                func.dict2str(line_dict['position']),
                func.dict2str(line_dict['cellphone'])
            ]
        elif source == 'AOS':
            if 'poiid' not in line_dict['request']:
                # No poiid: nothing to match, skip.
                continue
            poiid = func.get_value(line_dict['request'], 'poiid')
        else:
            # Unknown source: filtered out.
            continue

        # Guard against data skew: once a session has buffered more than
        # 10000 searches, the buffer is discarded (not written out).
        session_dropped = False
        if len(session_list) > 10000:
            session_list = []
            session_dropped = True

        # uid + sessionid identifies the session.
        cur_key = uid + '\t' + sessionid
        if cur_key == last_key:
            if source == 'SP':
                # The record that triggered the reset is dropped as well.
                if not session_dropped:
                    session_list.append(record)
            else:
                # Scans the *current* buffer, so an emptied buffer simply
                # yields no match.
                _record_click(session_list, poiid, stepid, time_str)
        else:
            # Key changed: flush the previous session and start a new one.
            for item in session_list:
                output.write(parse(item) + '\n')
            session_list = []
            if source == 'SP':
                session_list.append(record)
                last_key = cur_key
            else:
                # A click whose session has no buffered search.
                sys.stderr.write('only in AOS (%s)\n' % cur_key)
    # NOTE(review): when the input is exhausted, the searches of the last
    # session are still buffered and nothing visible here writes them out;
    # confirm whether a final flush is expected.
Exemplo n.º 3
0
        position_dict['user_loc_city'] = user_loc_city
        # 手机相关信息
        for i in cellphone_list:
            if i in cifa_dict:
                if i == 'div':  # [2014-5-21] div 大写
                    cellphone_dict[i] = func.get_value(cifa_dict,i).upper()
                else:
                    cellphone_dict[i] = func.get_value(cifa_dict,i)
            else:
                if i == 'div':
                    cellphone_dict[i] = func.get_value(request_dict,i).upper()
                else:
                    cellphone_dict[i] = func.get_value(request_dict,i)  #2014-09-05  cifa_dict->request_dict
        # 2014-6-17  新增字段is_general标记是否泛需求
        if 'keywords' in request_dict:
            if request_dict['keywords'] in general_dict:
                request_dict['is_general'] = '1'

        request_dict['cifa'] = func.dict2str(cifa_dict,';','=')
        for i in ('position','request','response','cellphone','other'):
            out_dict[i] = func.dict2str(eval("%s_dict"%(i)))
        try:
            print >>output,'\t'.join([out_dict[i] for i in out_list])
        except Exception,err:
            print >>sys.stderr,'out_dict print error! (%s)'%(repr(out_dict))
            pass
    #print json.dumps(tmp_dict,ensure_ascii=False,encoding='utf-8',indent=4)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Exemplo n.º 4
0
        #out_dict['action'] = in_dict['path'].rstrip('/') # action,去掉右侧/
        out_dict['request'] = tmp_para # request
        out_dict['request'].update(in_dict) # 2014-6-13 6.11增加冗余字段失效,导致aos格式偏移,原因是正则错误,包含了tab和分区字段
        #out_dict['other'] = {} # info  #2014-09-02 不要重置字典,会把已有内容清空
        # [2014-5-16] 将中文解释添加进去
        out_dict['other']['action_name'] = action_name
        #out_dict['other']['diu2'] = func.get_value(tmp_para,'diu2') #2014-09-02前面已取ine68-83
        #out_dict['other']['diu3'] = func.get_value(tmp_para,'diu3') #2014-09-02前面已取ine68-83
        out_dict['response'] = {} # response,aos不记录返回信息
        # pack result:
        out_str = '-'
        for i in out_list:
            if i not in out_dict:
                #20141112,garnett
                #print >>sys.stderr,'key(%s) not in out_dict!'%(i)
                #func.counter('Count','out_dict miss',1)
                continue
            v = out_dict[i]
            if type(v) == type({}):
                if out_str == '-':
                    out_str = func.dict2str(v)
                else:
                    out_str += '\t' + func.dict2str(v)
            else:
                out_str += '\t' + v
        print >>output,out_str.lstrip('-\t')

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

Exemplo n.º 5
0
        # sessionid,stepid
        for i in ('sessionid', 'stepid'):
            out_dict[i] = func.get_value(request_dict, i)
        # time
        if len(in_dict['time']) == 14:
            date = in_dict['time'][:8]
            time = in_dict['time'][8:10] + ':' + in_dict['time'][
                10:12] + ':' + in_dict['time'][12:]
        else:
            date, time = '-', '-'
        out_dict['time'] = time
        other_dict['date'] = date
        # position
        for i in ('geoobj', 'user_loc', 'user_city'):
            if i == 'geoobj':  # [2014-5-21]geoobj分隔符替换: | --> ;
                position_dict[i] = func.get_value(request_dict,
                                                  i).replace('|', ';')
            else:
                position_dict[i] = func.get_value(request_dict, i)
        out_dict['action'] = func.get_value(request_dict, 'query_type')
        # cellphone
        for i in ('div', 'platform'):
            if i == 'div':  # [2014-5-21] div大写
                cellphone_dict[i] = func.get_value(request_dict, i).upper()
            else:
                cellphone_dict[i] = func.get_value(request_dict, i)

        for i in ('position', 'request', 'response', 'cellphone', 'other'):
            out_dict[i] = func.dict2str(eval("%s_dict" % (i)))
        print '\t'.join([out_dict[i] for i in out_list])
Exemplo n.º 6
0
        other_dict['date'] = t_list[0].replace(
            '-', '')  # [2014-3-17] 2014-03-17 --> 20140317
        if line_dict['para'] != '':
            try:
                #para_dict = json.loads(arr[13])
                para_dict = json.loads(func.get_value(line_dict, 'para'))
            except Exception, err:
                #print >>sys.stderr,'[error] json data:para=[%s]'%(arr[13])
                func.counter('Count', 'para miss', 1)
                continue
        else:
            #print >>sys.stderr,'para empty!(%s)'%(line)
            para_dict = {}
            pass
        # 手机相关信息
        for i in cellphone_list:
            cellphone_dict[i] = func.get_value(line_dict, i)
        # 请求信息
        request_dict = line_dict
        out_dict['position'] = func.dict2str(position_dict)
        out_dict['request'] = func.dict2str(request_dict)
        out_dict['response'] = func.dict2str(para_dict)
        out_dict['cellphone'] = func.dict2str(cellphone_dict)
        out_dict['other'] = func.dict2str(other_dict)
        print >> output, '\t'.join([out_dict[i] for i in out_list])
    #print json.dumps(tmp_dict,ensure_ascii=False,encoding='utf-8',indent=4)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()