Example No. 1
def mapper():
    seg_path = './dict_seg_wordrank/worddict/'
    tag_path = './dict_seg_wordrank/tagdict/'
    ner_path = './dict_seg_wordrank/wordner.conf'
    stop_words_path = "./dict_seg_wordrank/stop_words.txt"
    seg_handle = wordrank_ecom.Seg_Rank()
    seg_handle.seg_dict_init(seg_path, tag_path, ner_path, stop_words_path)

    for line in sys.stdin:
        if not line:
            continue
        line = line.strip()
        data = line.split("\t")

        length = len(data)
        if length == 3:
            tag = data[1]
            if tag == "0":    #word vector dict
                term = process_str.handle_str(data[0]).strip()
                if len(term) == 0:
                    continue
                vec = data[2].strip().split(' ')
                norm = 0
                for ele in vec:
                    norm = norm + float(ele) * float(ele)
                norm = math.sqrt(norm)
                if norm == 0:    # guard: skip all-zero vectors to avoid division by zero
                    continue
                vec = [str(float(x)/norm) for x in vec]
                print "%s\t0\t%s" %(term, " ".join(vec))
            elif tag == "1":  #url vector dict
                print "%s\t0\t%s" %(data[0], data[2])
            else:
                continue
        else:
            if data[1] != "2":
                continue
            winfoid = data[0]
            bidword = data[2]

            for idx in xrange(3, len(data)):
                target_url = data[idx]
                print "%s\t1\turl_info\t%s" %(target_url, winfoid)

            if len(bidword) == 0:
                continue
            try:
                (ret, token, netoken) = seg_handle.get_seg(bidword)
                if ret < 0:
                    continue

                for term, postag in token:
                    term = term.strip()
                    if len(term) == 0:
                        continue
                    print "%s\t1\tterm_info\t%s" %(term, winfoid)
            except:
                continue
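
The tags exist to make the records joinable on the reduce side: for every term or url key, the tag-0 dictionary record sorts ahead of the tag-1 info records. Below is a minimal reducer sketch of that pairing; it is an illustration only, not the project's actual reducer, and it assumes Hadoop Streaming delivers the mapper output sorted by the first tab-separated field and then by tag.

# Hypothetical reducer sketch for the tagged mapper output above.
# Assumption: lines arrive sorted by key, then by tag, so the tag-0
# vector record of a key (if any) precedes its tag-1 info records.
import sys

cur_key = None
cur_vec = None
for line in sys.stdin:
    data = line.rstrip("\n").split("\t")
    if len(data) < 3:
        continue
    key, tag = data[0], data[1]
    if key != cur_key:
        cur_key, cur_vec = key, None
    if tag == "0":                    # normalized vector for this term/url
        cur_vec = data[2]
    elif tag == "1" and cur_vec:      # url_info/term_info record: attach the vector
        print "%s\t%s\t%s" % (key, "\t".join(data[2:]), cur_vec)
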
Example No. 2
def mapper(fea_common_conf_filename):
    '''@desc Mapper stage: tag each input resource with a distinct tag so the resources can be joined downstream '''
    user_trade = read_user_trade()
    parse_shitu_log_obj = parse_shitu_log(fea_common_conf_filename)
    parse_shitu_log_obj.initialize()

    ps_dict = {}
    bd_trade = {}

    for line in sys.stdin:
        if not line:
            continue
        line = line.strip()
        data = line.split("\t")

        if len(line) == 0 or len(data) < 1:
            continue

        part = data[0].split(",")
        if len(part) == 3 and (data[1] == "pc" or data[1] == "wise"):    # log
            key = urllib.unquote(data[parse_shitu_log.join_key_index].strip()).replace("\t", " ")
            key = key.replace("\n", " ")
            key = process_str.handle_str(key)

            if len(key) == 0:
                continue

            userid = data[parse_shitu_log.userid_index]
            trade = user_trade.get(userid, 0)
            print "%s\t3\t%s\t%s" %(key, line, trade)   # tag 3: shitu log line

        elif len(data) == 3:   #bidword trade
            key = process_str.handle_str(data[1])
            if len(key) == 0:
                continue
            if key in bd_trade:
                continue
            bd_trade[key] = 1
            print "%s\t0\t%s" %(key, data[2])           # tag 0: bidword trade
        elif data[1] == "300":   #plsa_pzd 300
            try:
                key = process_str.handle_str(data[0])
            except:
                print >>sys.stderr, data[0]
                continue

            if len(key) == 0:
                continue
            value = "\N"
            if data[4] == "0":   # data[4] == "0" means the plsa vector is empty
                print "%s\t1\t%s" %(key, value)
            else:
                idx = 5
                to_print = []
                length = len(data)
                while idx < length and (idx + 1) < length:
                    value = "%s:%s" %(data[idx], data[idx + 1])
                    to_print.append(value)
                    idx += 2
                print "%s\t1\t%s" %(key, ",".join(to_print))   # tag 1: plsa vector
        else:                     #ps term   
            key = process_str.handle_str(data[0])
            if len(key) == 0:
                continue
            if key in ps_dict:
                continue
            value = "\N"
            if int(data[2]) == 0:
                print "%s\t2\t%s" %(key, value)
            else:
                idx = 3
                to_print = []
                length = len(data)
                while idx < length and (idx + 1) < length:
                    value = "%s:%s" %(data[idx], data[idx + 1])
                    to_print.append(value)
                    idx += 2
                print "%s\t2\t%s" %(key, ",".join(to_print))    # tag 2: ps term
            ps_dict[key] = 1
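
All four resources end up keyed by the normalized term, so a reducer can attach the bidword trade (tag 0), plsa vector (tag 1), and ps term info (tag 2) to every shitu log line (tag 3). The sketch below only illustrates that join; it assumes the streaming framework sorts mapper output by key and then by tag, and it is not the project's actual reducer.

# Hypothetical join sketch for the tags emitted above (illustration only).
import sys
from itertools import groupby

def keyed(stream):
    for line in stream:
        data = line.rstrip("\n").split("\t")
        if len(data) >= 3:
            yield data[0], data[1], data[2:]

for key, records in groupby(keyed(sys.stdin), key=lambda r: r[0]):
    trade, plsa, ps = "\N", "\N", "\N"   # defaults match the mapper's null marker
    for _, tag, rest in records:
        if tag == "0":                   # bidword trade
            trade = rest[0]
        elif tag == "1":                 # plsa vector
            plsa = rest[0]
        elif tag == "2":                 # ps term info
            ps = rest[0]
        elif tag == "3":                 # shitu log line: emit with the joined fields
            print "%s\t%s\t%s\t%s\t%s" % (key, trade, plsa, ps, "\t".join(rest))
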
Example No. 3
def pretreatment(model_type, fea_common_conf_filename):
    '''@desc Pre-process the shitu log (field extraction, filtering, etc.) '''
    white_dict = read_white_list()
    parse_shitu_log_obj = parse_shitu_log(fea_common_conf_filename)
    parse_shitu_log_obj.initialize() 
    
    log_pc_show = 0
    log_pc_click = 0
    log_pc_price = 0
    log_wise_show = 0
    log_wise_click = 0
    log_wise_price = 0

    for line in sys.stdin:
        if not line:
            continue
        line = line.strip()
        data = line.split("\t")

        if line == "" or len(data) == 0:
            continue

        wise_cmatch = ["222", "223", "228", "229"]
        pc_cmatch = ["201", "204", "225"]

        if model_type == "predict":     # model_type "predict": run prediction over the shitu log
            bidword = urllib.unquote(data[23]).replace("\t", " ")   # URL-decode
            bidword = process_str.handle_str(bidword).strip()
            if len(bidword) == 0:
                continue
            rec_field = []
            cmatch = data[9]
            wmatch = data[10]
            winfoid = data[11]
            key = data[6] + "," + data[9] + "," + data[12]  #searchid + cmatch + rank
            rec_field.append(winfoid)
            rec_field.append("record_field")
            rec_field.append(cmatch)
            rec_field.append(wmatch)
            rec_field.append(bidword)
            rec_field.append(data[0])    #show
            rec_field.append(data[1])    #click
            rec_field.append(data[2])    #price
            if cmatch in wise_cmatch:    #wise log
                showurl = urllib.unquote(data[50]).strip()
                log_wise_show += int(data[0])
                log_wise_click += int(data[1])
                log_wise_price += int(data[2])
                target_url = urllib.unquote(data[92]).strip()
                rec_field.append(target_url)
                print "\t".join(rec_field)
            elif cmatch in pc_cmatch:    #pc log
                showurl = urllib.unquote(data[51]).strip()
                log_pc_show += int(data[0])
                log_pc_click += int(data[1])
                log_pc_price += int(data[2])
                target_url = urllib.unquote(data[73]).strip()
                rec_field.append(target_url)
                print "\t".join(rec_field)
            else:
                rec_field.append("NULL")
                print "\t".join(rec_field)
                continue
        else:                           # training/testing path; the train/test data get some extra handling
            bidword = urllib.unquote(data[24]).replace("\t", " ")
            bidword = process_str.handle_str(bidword).strip()
            if len(bidword) == 0:
                continue
            cmatch = data[10]
            wmatch = data[11]
            winfoid = data[12]
            key = data[7] + "," + data[10] + "," + data[13]  #searchid + cmatch + rank
            rec_field = []
            rec_field.append(winfoid) 
            rec_field.append("record_field")
            rec_field.append(cmatch)
            rec_field.append(wmatch)
            rec_field.append(bidword)
            rec_field.append(data[1])    #show
            rec_field.append(data[2])    #click
            rec_field.append(data[3])    #price
            if cmatch in wise_cmatch:    #wise log
                showurl = urllib.unquote(data[51]).strip()
                target_url = urllib.unquote(data[93]).strip()
                rec_field.append(target_url)
                print "%s#C" %("\t".join(rec_field))
            elif cmatch in pc_cmatch:    #pc log
                showurl = urllib.unquote(data[52]).strip()
                target_url = urllib.unquote(data[74]).strip()
                rec_field.append(target_url)
                print "%s#C" %("\t".join(rec_field))
            else:
                rec_field.append("NULL")
                print "%s#C" %("\t".join(rec_field))
                continue
            
        
        # whitelist filtering; dictionary format: bidword \t showurl
        if model_type == "predict" and bidword in white_dict:
            part = white_dict[bidword].split(",")
            #proto, rest = urllib.splittype(showurl)
            #host, rest = urllib.splithost(rest)
            if showurl in part:
                continue
        
        '''
        # only run prediction for exact-match ads
        if model_type == "predict" and wmatch != "63": 
            continue
        '''
        
        if cmatch in wise_cmatch:
            source = "wise"
        elif cmatch in pc_cmatch:
            source = "pc"
        else:
            continue
        p_value = 0
        label = 0

        # deduplicate the log using winfo + desc1 as the key
        if model_type == "predict":
            winfo = data[11]
            desc1 = data[25]
            nline = "%s\t%s\t%s\t%s\t%s" %(key, source, p_value, label, line)
            fea_value = parse_shitu_log_obj.parse_field(nline, source)
            if fea_value == "NULL" or fea_value == "":
                continue

            merge_key = winfo + "," + wmatch + "," + source + "\t" + desc1
            print "%s\t%s" %(merge_key, fea_value)
        else:
            winfo = data[12]
            nline = "%s\t%s\t%s\t%s" %(key, source, p_value, line)
            fea_value = parse_shitu_log_obj.parse_field(nline, source)
            if fea_value == "NULL" or fea_value == "":
                continue
            print "%s#A" %(fea_value)
    
    if model_type == "predict":
        print "tongji\tpc_log\t%s\t%s\t%s" %(log_pc_show, log_pc_click, log_pc_price)
        print "tongji\twise_log\t%s\t%s\t%s" %(log_wise_show, log_wise_click, log_wise_price)
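
The helper read_white_list is not shown in this example. A minimal sketch consistent with how white_dict is used above (key: normalized bidword, value: comma-separated showurls, loaded from a "bidword \t showurl" dictionary file) could look as follows; the file name and the comma aggregation are assumptions, and process_str is the project's own normalization helper.

# Hypothetical sketch of read_white_list (file name is an assumption).
def read_white_list(path="./white_list.txt"):
    white_dict = {}
    fp = open(path)
    for line in fp:
        data = line.rstrip("\n").split("\t")
        if len(data) < 2:
            continue
        bidword = process_str.handle_str(data[0]).strip()   # same normalization as the mapper lookup
        if len(bidword) == 0:
            continue
        showurl = data[1].strip()
        if bidword in white_dict:
            white_dict[bidword] += "," + showurl             # comma-joined, matching split(",") above
        else:
            white_dict[bidword] = showurl
    fp.close()
    return white_dict
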
seg_handle.seg_dict_init(seg_path, tag_path, ner_path, stop_words_path)

# input:
#   1. term \t tag(0) \t vector
#   2. target_url \t query \t click \t show \t ctr \t price
# output:
#   1. term \t tag(1) \t vector
#   2. term \t tag(2) \t target_url \t click
for line in sys.stdin:
    flds = line.strip().split("\t")
    flds_size = len(flds)
    if flds_size != 6 and flds_size != 3:
        continue

    if flds_size == 3:  # word vector dict
        word = process_str.handle_str(flds[0]).strip()
        if len(word) == 0:
            continue
        
        norm = 0
        vec = flds[2].strip().split(' ')
        for ele in vec:
            norm = norm + float(ele) * float(ele)
        norm = math.sqrt(norm)
        if norm == 0:    # guard: skip all-zero vectors to avoid division by zero
            continue
        vec = [str(float(x)/norm) for x in vec]
        print "%s\t1\t%s" % (word, " ".join(vec))

    elif flds_size == 6:
        target_url = flds[0]
        if len(target_url.split("\t")) != 1 or target_url.isdigit():
            continue
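
The division by the L2 norm above turns each vector into a unit vector, presumably so that a downstream dot product can be read directly as a cosine similarity. A small standalone sketch of that relationship (names are illustrative, not from the project):

import math

def l2_normalize(vec):
    norm = math.sqrt(sum(x * x for x in vec))
    if norm == 0:                  # leave all-zero vectors untouched
        return vec
    return [x / norm for x in vec]

def cosine(u, v):
    # the dot product of two unit-normalized vectors equals their cosine similarity
    return sum(a * b for a, b in zip(l2_normalize(u), l2_normalize(v)))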