def _unit_vector(vector_text):
    """Scale a space-separated numeric vector to unit L2 norm.

    Returns the scaled components as a list of strings, or None when a
    component is not a number or the vector has zero norm (it cannot be
    normalised in either case).
    """
    try:
        values = [float(item) for item in vector_text.strip().split(' ')]
    except ValueError:
        return None
    norm_sq = 0.0
    for value in values:
        norm_sq += value * value
    norm = math.sqrt(norm_sq)
    if norm == 0:
        return None
    return [str(value / norm) for value in values]


def mapper():
    """Tag vector and bidword records read from stdin for a later join.

    Input is tab-separated, one record per line:
      * ``key \\t 0 \\t vector``  word vector: the key is cleaned with
        ``process_str.handle_str`` and the vector is scaled to unit length.
      * ``key \\t 1 \\t vector``  url vector: emitted unchanged.
      * ``winfoid \\t 2 \\t bidword \\t target_url [\\t target_url ...]``
        bidword record.  A record with exactly three fields is always
        treated as a vector record, so a bidword record without any target
        url is dropped.

    Output, written to stdout:
      * ``key \\t 0 \\t vector`` for both kinds of vector record.
      * ``target_url \\t 1 \\t url_info \\t winfoid`` for every target url.
      * ``term \\t 1 \\t term_info \\t winfoid`` for every non-empty term the
        segmenter returns for the bidword.

    Records with fewer than three fields (including blank lines) and word
    vectors that cannot be normalised are skipped instead of aborting the
    whole task; unusable vectors are reported on stderr.
    """
    seg_path = './dict_seg_wordrank/worddict/'
    tag_path = './dict_seg_wordrank/tagdict/'
    ner_path = './dict_seg_wordrank/wordner.conf'
    stop_words_path = "./dict_seg_wordrank/stop_words.txt"
    seg_handle = wordrank_ecom.Seg_Rank()
    seg_handle.seg_dict_init(seg_path, tag_path, ner_path, stop_words_path)

    for line in sys.stdin:
        data = line.strip().split("\t")
        field_count = len(data)
        if field_count < 3:
            # Blank or truncated record: every branch below needs data[2].
            continue
        tag = data[1]

        if field_count == 3:
            if tag == "0":  # word vector dict
                term = process_str.handle_str(data[0]).strip()
                if not term:
                    continue
                unit = _unit_vector(data[2])
                if unit is None:
                    sys.stderr.write("skip unusable vector for: %s\n" % data[0])
                    continue
                print("%s\t0\t%s" % (term, " ".join(unit)))
            elif tag == "1":  # url vector dict
                print("%s\t0\t%s" % (data[0], data[2]))
            continue

        if tag != "2":
            continue
        winfoid = data[0]
        bidword = data[2]
        for target_url in data[3:]:
            print("%s\t1\turl_info\t%s" % (target_url, winfoid))
        if not bidword:
            continue

        # Segmentation is best-effort: any failure inside the segmenter (or an
        # unexpected token shape) drops the term records for this bidword.
        # Output happens outside the try so I/O errors are not swallowed.
        try:
            ret, token, _netoken = seg_handle.get_seg(bidword)
            if ret < 0:
                continue
            terms = [term.strip() for term, _postag in token]
        except Exception:
            continue
        for term in terms:
            if term:
                print("%s\t1\tterm_info\t%s" % (term, winfoid))
def _join_pairs(fields, start):
    """Join ``fields[start:]`` two at a time as comma-separated ``a:b`` items.

    A trailing field without a partner is ignored.
    """
    pairs = []
    for idx in range(start, len(fields) - 1, 2):
        pairs.append("%s:%s" % (fields[idx], fields[idx + 1]))
    return ",".join(pairs)


def mapper(fea_common_conf_filename):
    """Mapper stage: give each kind of input resource its own tag for the join.

    Reads tab-separated records from stdin and writes ``key \\t tag \\t ...``
    records to stdout.  The record kind is detected from its shape:

      * log record (``data[0]`` has three comma-separated parts and
        ``data[1]`` is ``pc`` or ``wise``) -> tag 3: cleaned join key, the
        whole input line, and the user's trade (0 when unknown).
      * three-field record (bidword trade) -> tag 0, first occurrence of each
        cleaned key only.
      * ``data[1] == "300"`` (PLSA vector) -> tag 1; value is ``\\N`` when
        ``data[4]`` is ``"0"``, otherwise the ``a:b`` pairs from field 5 on.
      * anything else (ps term) -> tag 2, first occurrence of each cleaned
        key only; value is ``\\N`` when ``int(data[2])`` is 0, otherwise the
        ``a:b`` pairs from field 3 on.

    Records too short for their branch, and ps-term records whose count is
    not an integer, are skipped rather than aborting the task.

    Args:
        fea_common_conf_filename: configuration file name handed to
            ``parse_shitu_log``.
    """
    user_trade = read_user_trade()
    # The instance is not used below; it is still built and initialised as in
    # the original code -- presumably initialize() prepares the class-level
    # column indexes read in the log branch (TODO confirm).
    parse_shitu_log_obj = parse_shitu_log(fea_common_conf_filename)
    parse_shitu_log_obj.initialize()

    null_value = "\\N"  # backslash + N, same value as the Python 2 literal "\N"
    seen_ps_terms = set()
    seen_bidwords = set()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        data = line.split("\t")
        if len(data) < 2:
            # Every branch below inspects data[1].
            continue

        is_log = len(data[0].split(",")) == 3 and data[1] in ("pc", "wise")
        if is_log:
            raw_key = data[parse_shitu_log.join_key_index].strip()
            key = urllib.unquote(raw_key).replace("\t", " ").replace("\n", " ")
            key = process_str.handle_str(key)
            if len(key) == 0:
                continue
            userid = data[parse_shitu_log.userid_index]
            trade = user_trade.get(userid, 0)
            print("%s\t3\t%s\t%s" % (key, line, trade))  # tag 3

        elif len(data) == 3:  # bidword trade
            key = process_str.handle_str(data[1])
            if len(key) == 0 or key in seen_bidwords:
                continue
            seen_bidwords.add(key)
            print("%s\t0\t%s" % (key, data[2]))  # tag 0

        elif data[1] == "300":  # plsa_pzd 300
            try:
                key = process_str.handle_str(data[0])
            except Exception:
                sys.stderr.write("%s\n" % data[0])
                continue
            if len(key) == 0 or len(data) < 5:
                continue
            if data[4] == "0":  # data[4] == 0 means the PLSA vector is empty
                print("%s\t1\t%s" % (key, null_value))
            else:
                print("%s\t1\t%s" % (key, _join_pairs(data, 5)))  # tag 1

        else:  # ps term
            key = process_str.handle_str(data[0])
            if len(key) == 0 or key in seen_ps_terms:
                continue
            if len(data) < 3:
                continue
            try:
                is_empty = int(data[2]) == 0
            except ValueError:
                sys.stderr.write("skip ps term with bad count: %s\n" % data[0])
                continue
            if is_empty:
                print("%s\t2\t%s" % (key, null_value))
            else:
                print("%s\t2\t%s" % (key, _join_pairs(data, 3)))  # tag 2
            seen_ps_terms.add(key)
def pretreatment(model_type, fea_common_conf_filename):
    """Preprocess shitu log lines from stdin (field extraction, filtering).

    For every usable line a "record_field" line is printed:
    ``winfoid, "record_field", cmatch, wmatch, bidword, show, click, price,
    target_url`` (``NULL`` instead of the target url when cmatch is neither a
    wise nor a pc value; such lines get no feature output).  For wise/pc
    lines the features returned by ``parse_shitu_log.parse_field`` are then
    printed unless they are ``"NULL"`` or empty.

    Two modes, selected by ``model_type``:
      * ``"predict"``: columns are read at their base index; lines whose
        showurl is white-listed for the bidword produce no feature output;
        feature lines are keyed by ``winfoid,wmatch,source \\t desc1``;
        show/click/price totals per source are printed as ``tongji`` lines
        after the input is exhausted.
      * anything else (training / test): every column index is one greater,
        record lines end with ``#C`` and feature lines end with ``#A``.

    Args:
        model_type: ``"predict"`` or any other value for training/test.
        fea_common_conf_filename: configuration file name handed to
            ``parse_shitu_log``.
    """
    white_dict = read_white_list()
    parse_shitu_log_obj = parse_shitu_log(fea_common_conf_filename)
    parse_shitu_log_obj.initialize()

    is_predict = model_type == "predict"
    # Training/test input carries one extra leading column, so every index
    # used below is shifted by one in that mode.
    off = 0 if is_predict else 1
    record_suffix = "" if is_predict else "#C"

    wise_cmatch = ("222", "223", "228", "229")
    pc_cmatch = ("201", "204", "225")
    # source -> (showurl column, target url column), predict-mode indexes
    url_columns = {"wise": (50, 92), "pc": (51, 73)}
    # source -> [show, click, price] totals, only accumulated in predict mode
    totals = {"pc": [0, 0, 0], "wise": [0, 0, 0]}

    for line in sys.stdin:
        line = line.strip()
        if line == "":
            continue
        data = line.split("\t")

        # The bidword is URL-encoded in the log.
        bidword = urllib.unquote(data[23 + off]).replace("\t", " ")
        bidword = process_str.handle_str(bidword).strip()
        if len(bidword) == 0:
            continue

        cmatch = data[9 + off]
        wmatch = data[10 + off]
        winfoid = data[11 + off]
        # searchid + cmatch + rank
        key = data[6 + off] + "," + cmatch + "," + data[12 + off]

        if cmatch in wise_cmatch:
            source = "wise"
        elif cmatch in pc_cmatch:
            source = "pc"
        else:
            source = None

        rec_field = [
            winfoid,
            "record_field",
            cmatch,
            wmatch,
            bidword,
            data[off],      # show
            data[1 + off],  # click
            data[2 + off],  # price
        ]
        if source is None:
            rec_field.append("NULL")
            print("\t".join(rec_field) + record_suffix)
            continue

        showurl_col, target_col = url_columns[source]
        showurl = urllib.unquote(data[showurl_col + off]).strip()
        if is_predict:
            counters = totals[source]
            counters[0] += int(data[0])
            counters[1] += int(data[1])
            counters[2] += int(data[2])
        rec_field.append(urllib.unquote(data[target_col + off]).strip())
        print("\t".join(rec_field) + record_suffix)

        # White-list filter; each entry maps a bidword to comma-separated
        # showurls.
        if is_predict and bidword in white_dict:
            if showurl in white_dict[bidword].split(","):
                continue

        # NOTE: a filter restricting prediction to exact-match ads
        # (wmatch == "63") existed here in disabled form and stays disabled.

        p_value = 0
        label = 0
        if is_predict:
            # winfo + desc1 is the key used to de-duplicate the log.
            desc1 = data[25]
            nline = "%s\t%s\t%s\t%s\t%s" % (key, source, p_value, label, line)
        else:
            nline = "%s\t%s\t%s\t%s" % (key, source, p_value, line)

        fea_value = parse_shitu_log_obj.parse_field(nline, source)
        if fea_value == "NULL" or fea_value == "":
            continue

        if is_predict:
            merge_key = winfoid + "," + wmatch + "," + source + "\t" + desc1
            print("%s\t%s" % (merge_key, fea_value))
        else:
            print("%s#A" % fea_value)

    if is_predict:
        print("tongji\tpc_log\t%s\t%s\t%s" % tuple(totals["pc"]))
        print("tongji\twise_log\t%s\t%s\t%s" % tuple(totals["wise"]))
seg_handle.seg_dict_init(seg_path, tag_path, ner_path, stop_words_path); # input: # 1. term \t tag(0) \t vector # 2. target_url \t query \t click \t show \t ctr \t price # output: # 1. term \t tag(1) \t vector # 2. term \t tag(2) \t target_url \t click for line in sys.stdin: flds = line.strip().split("\t"); flds_size = len(flds); if flds_size != 6 and flds_size != 3: continue; if flds_size == 3: # word vector dict word = process_str.handle_str(flds[0]).strip() if len(word) == 0: continue length = 0; vec = flds[2].strip().split(' '); for ele in vec: length = length + float(ele) * float(ele); length = math.sqrt(length); vec = [str(float(x)/length) for x in vec]; print "%s\t1\t%s" % (word, " ".join(vec)); elif flds_size == 6: target_url = flds[0]; if len(target_url.split("\t")) != 1 or target_url.isdigit(): continue;