def main():
    # Build a per-user timeline of search events from the raw TSV streams,
    # then emit two per-search count files (user_cnt_*, user_aid_cnt_*) and
    # sort each output numerically on its first column via the shell `sort`.
    #
    # NOTE(review): relies on module-level globals — `args`, `uid_sid`
    # (presumably a defaultdict(list) keyed by UserID; verify at module
    # scope), plus helpers `next_row`, `read_tsv`, `data`, `convert_ts`,
    # `get_rows`, `get_aid_rows`.
    train_iter = next_row(read_tsv("data/stream_%s.tsv"%args.sz))
    test_iter = iter([])  # no test split in this pass
    sinfo_iter = read_tsv("data/sinfo_%s.tsv"%args.sz)
    # Fields dropped from each ad row before the remaining values are
    # converted to ints (a list, despite the "_set" name).
    del_keys_set = ["HistCTR", "SearchID", "ObjectType"]
    for t, (data_type, rows, sinfo) in enumerate(data(train_iter=train_iter, test_iter=test_iter, sinfo_iter=sinfo_iter)):
        uid = int(sinfo["UserID"])
        date_str = sinfo["SearchDate"]
        # Search timestamps look like "2015-05-01 12:34:56.0".
        ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))
        # Keep only contextual ads (ObjectType == 3).  Python 2 `filter`
        # returns a list, so `rows` can be iterated again below.
        rows = filter(lambda x : int(x["ObjectType"]) == 3, rows)
        for row in rows:
            for key in del_keys_set:
                del row[key]
            for key in row:
                # Empty strings become 0 so every retained field is an int.
                row[key] = int(row[key]) if row[key] != "" else 0
        # One event per search: (timestamp, SearchID, ((AdID, IsClick, Position), ...)).
        item = (
            ts,
            int(sinfo["SearchID"]),
            tuple([(row["AdID"], row["IsClick"], row["Position"]) for row in rows]),
        )
        uid_sid[uid].append(item)
    print "uid_sid: %s"%len(uid_sid)
    # Sort each user's events chronologically (tuples compare by ts first).
    for uid in uid_sid:
        uid_sid[uid].sort()
    print "start user_cnt."
    file_name = "data/user_cnt_%s.csv"%args.sz
    with open(file_name, "w") as f:
        writer = DictWriter(f, fieldnames=["SearchID", "t_cnt", "bf_cnt", "af_cnt", "bf_3h_cnt", "af_3h_cnt", "bf_clk_cnt", "bag2", "bag1"])
        writer.writeheader()
        for uid in uid_sid:
            all_se = uid_sid[uid]
            writer.writerows(get_rows(all_se))
    # Sort the closed file by SearchID (column 1, general-numeric) in place.
    os.system('sort -t"," -k1 -g -S 2G %s -o %s_sorted'%(file_name, file_name))
    print "start user_aid_cnt."
    file_name = "data/user_aid_cnt_%s.csv"%args.sz
    with open(file_name, "w") as f:
        writer = DictWriter(f, fieldnames=["SearchID", "AdID", "clk_cnt", "show_cnt", "t_show_cnt", "pos_clk_cnt", "pos_show_cnt"])
        writer.writeheader()
        for uid in uid_sid:
            all_se = uid_sid[uid]
            writer.writerows(get_aid_rows(uid, all_se))
    os.system('sort -t"," -k1 -g -S 2G %s -o %s_sorted'%(file_name, file_name))
def get_ad_info(aid):
    # Fetch one ad's document from MongoDB and normalize it via
    # trans_ad_info.  NOTE(review): unlike the variants elsewhere in this
    # file, this definition is unconditional and uncached; `db` must already
    # be bound at module scope — verify against the surrounding setup.
    ad_info = db.ad_info.find_one({"AdID": aid})
    return trans_ad_info(ad_info)


# Module-level setup: load precomputed per-key count dictionaries and shared
# lookup tables, open the (unsorted, un-suffixed) count CSVs, then run the
# pipeline.  Order matters — main() consumes these globals.
uid_cnt_dict = read_dump("data/uid_cnt.dump")
adid_cnt_dict = read_dump("data/adid_cnt.dump")
ipid_cnt_dict = read_dump("data/ipid_cnt.dump")
query_cnt_dict = read_dump("data/query_cnt.dump")
title_cnt_dict = read_dump("data/title_cnt.dump")
query_param_cnt_dict = read_dump("data/query_param_cnt.dump")
ad_param_cnt_dict = read_dump("data/ad_param_cnt.dump")
user_info_map = get_user_info()
category_map = get_category()
user_cnt_iter = read_tsv("data/user_cnt.csv", delimiter=",")
user_aid_cnt_iter = next_row(read_tsv("data/user_aid_cnt.csv", delimiter=","))
main()
def main():
    # Stream (data_type, rows, sinfo) records and write one hashed-feature
    # instance file per split (tr / cv / te) in the format selected by
    # args.type.  data_type indexes fh_list (0=train, 1=cv, 2=test).
    #
    # NOTE(review): depends on module globals — `args` plus helpers
    # `read_tsv`, `next_row`, `data`, `extract_slot_feas`, `filter_row`,
    # `get_features`, `extract`, `write_dump`, `hash_val`.
    random.seed(args.seed)
    # Dense numeric features hashed with the "xgb" scheme.
    xgb_set = set([
        "price_pos", "ot1_cnt", "bf_cnt", "bf_clk_cnt", "u_aid_ctr",
        "record_cnt", "show_cnt", "clk_cnt", "t_cnt", "qe_w_pos", "HistCTR",
        "qe_ng_min_pos", "t_show_cnt", "bf_ctr", "ot2_cnt", "Price",
        "qe_ng_cnt", "title_len", "hl_ucnt", "price_ratio", "hl_lcnt",
        "t_match", "qe_w_ratio", "qe_ng_ratio", "Position", "bf_3h_cnt",
        "qe_w_cnt", "af_cnt", "ot3_cnt", "af_3h_cnt", "adid_cnt",
        "IsUserLoggedOn",
    ])
    # High-cardinality / sparse id features hashed with the "xgb2" scheme.
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type", "ca_match", "ca_pid_match", "CategoryID",
        "s_LocationID", "s_CategoryID", "UserAgentFamilyID", "UserAgentOSID",
        "UserDeviceID", "UserAgentID", "UserID", "IPID", "AdID",
        "SearchParams", "Params", "Title", "SearchQuery"
    ])
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")
        ]
    else:
        fh_list = [
            open("data/tr.%s" % (args.type), "w"),
            open("data/cv.%s" % (args.type), "w"),
            open("data/te.%s" % (args.type), "w")
        ]
    if args.sz is not None:
        # Sampled-stream mode: read the pre-cut stream/sinfo files.
        train_iter = next_row(read_tsv("data/stream_%s.tsv" % args.sz))
        test_iter = iter([])
        sinfo_iter = read_tsv("data/sinfo_%s.tsv" % args.sz)
        data_iter = data(args.test, train_iter=train_iter,
                         test_iter=test_iter, sinfo_iter=sinfo_iter,
                         maxlines=args.maxl)
    else:
        data_iter = data(args.test, maxlines=args.maxl)
    print "sr: %s" % args.sr
    # Per-split [clicks, impressions] accumulator for the final CTR report.
    avg_ctr = defaultdict(lambda: [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        # Python 2 `filter` returns a list, so `rows` is reusable below.
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # First record: dump the feature layout for debugging and
            # persist the sorted feature-name order used for hashing.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)
        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))
        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    # xgb output keeps only whitelisted features; anything
                    # else is dropped (announced once on the first record).
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        if line_cnt == 0:
                            print "drop %s" % k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # Multi-valued features emit one hashed token per element.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            # Line format: "<label> <tok> <tok> ...".
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    # Report average CTR per split.
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
# Script entry: build the CLI, optionally wire up a cached MongoDB-backed
# ad-info lookup, load the shared lookup tables, then run main().
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('--test', type=int, default=0)
parser.add_argument('--mongo', type=int, default=0)
parser.add_argument('--sz', type=int, default=None)
# BUG FIX: argparse only applies `type` to string defaults, so
# `default=1e6` left args.maxl as a float when --maxl was omitted;
# use an int literal so the default matches the declared type.
parser.add_argument('--maxl', type=int, default=1000000)
parser.add_argument('--type', type=str, default="ins")
parser.add_argument('--sr', type=float, default=0.1)
parser.add_argument('--seed', type=int, default=9)
parser.add_argument('--date', type=int, default=0)
parser.add_argument('--log', type=int, default=1)
args = parser.parse_args()
if args.mongo:
    from pymongo import MongoClient
    import functools32 as functools
    client = MongoClient('localhost', 27017)
    db = client.test

    @functools.lru_cache(maxsize=1000000)
    def get_ad_info(aid):
        # Cached lookup of one ad's metadata from MongoDB.
        ad_info = db.ad_info.find_one({"AdID": aid})
        return trans_ad_info(ad_info)

user_info_map = get_user_info()
category_map = get_category()
# Consume the pre-sorted count files produced by the earlier pipeline stage.
user_cnt_iter = read_tsv("data/user_cnt_%s.csv_sorted" % args.sz, delimiter=",")
user_aid_cnt_iter = next_row(
    read_tsv("data/user_aid_cnt_%s.csv_sorted" % args.sz, delimiter=","))
main()
if __name__ == '__main__':
    # Script entry: build the CLI, optionally wire up a cached
    # MongoDB-backed ad-info lookup, load shared tables, and run main().
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--mongo', type=int, default=0)
    parser.add_argument('--sz', type=int, default=None)
    # BUG FIX: argparse only applies `type` to string defaults, so
    # `default=1e6` left args.maxl as a float when --maxl was omitted;
    # use an int literal so the default matches the declared type.
    parser.add_argument('--maxl', type=int, default=1000000)
    parser.add_argument('--type', type=str, default="ins")
    parser.add_argument('--sr', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=9)
    args = parser.parse_args()
    if args.mongo:
        from pymongo import MongoClient
        import functools32 as functools
        client = MongoClient('localhost', 27017)
        db = client.test

        @functools.lru_cache(maxsize=1000000)
        def get_ad_info(aid):
            # Cached lookup of one ad's metadata from MongoDB.
            ad_info = db.ad_info.find_one({"AdID": aid})
            return trans_ad_info(ad_info)

    user_info_map = get_user_info()
    category_map = get_category()
    user_cnt_iter = read_tsv("data/user_cnt.csv", delimiter=",")
    user_aid_cnt_iter = next_row(
        read_tsv("data/user_aid_cnt.csv", delimiter=","))
    main()
def main():
    # Stream (data_type, rows, sinfo) records and write one hashed-feature
    # instance file per split (tr / cv / te) in the format selected by
    # args.type.  data_type indexes fh_list (0=train, 1=cv, 2=test).
    #
    # NOTE(review): near-duplicate of the other main() in this file that
    # writes the same outputs; depends on module globals — `args` plus
    # helpers `read_tsv`, `next_row`, `data`, `extract_slot_feas`,
    # `filter_row`, `get_features`, `extract`, `write_dump`, `hash_val`.
    random.seed(args.seed)
    # Dense numeric features hashed with the "xgb" scheme.
    xgb_set =set([
        "price_pos", "ot1_cnt", "bf_cnt", "bf_clk_cnt", "u_aid_ctr",
        "record_cnt", "show_cnt", "clk_cnt", "t_cnt", "qe_w_pos", "HistCTR",
        "qe_ng_min_pos", "t_show_cnt", "bf_ctr", "ot2_cnt", "Price",
        "qe_ng_cnt", "title_len", "hl_ucnt", "price_ratio", "hl_lcnt",
        "t_match", "qe_w_ratio", "qe_ng_ratio", "Position", "bf_3h_cnt",
        "qe_w_cnt", "af_cnt", "ot3_cnt", "af_3h_cnt", "adid_cnt",
        "IsUserLoggedOn",
    ])
    # High-cardinality / sparse id features hashed with the "xgb2" scheme.
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type", "ca_match", "ca_pid_match", "CategoryID",
        "s_LocationID", "s_CategoryID", "UserAgentFamilyID", "UserAgentOSID",
        "UserDeviceID", "UserAgentID", "UserID", "IPID", "AdID",
        "SearchParams", "Params", "Title", "SearchQuery"
    ])
    if args.test:
        fh_list = [
            open("data/tr_%s.%s"%(args.test, args.type), "w"),
            open("data/cv_%s.%s"%(args.test, args.type), "w"),
            open("data/te_%s.%s"%(args.test, args.type), "w")]
    else:
        fh_list = [open("data/tr.%s"%(args.type), "w"),
                   open("data/cv.%s"%(args.type), "w"),
                   open("data/te.%s"%(args.type), "w")]
    if args.sz is not None:
        # Sampled-stream mode: read the pre-cut stream/sinfo files.
        train_iter = next_row(read_tsv("data/stream_%s.tsv"%args.sz))
        test_iter = iter([])
        sinfo_iter = read_tsv("data/sinfo_%s.tsv"%args.sz)
        data_iter = data(args.test, train_iter=train_iter,
                         test_iter=test_iter, sinfo_iter=sinfo_iter,
                         maxlines=args.maxl)
    else:
        data_iter = data(args.test, maxlines=args.maxl)
    print "sr: %s"%args.sr
    # Per-split [clicks, impressions] accumulator for the final CTR report.
    avg_ctr = defaultdict(lambda : [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        # Python 2 `filter` returns a list, so `rows` is reusable below.
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # First record: dump the feature layout for debugging and
            # persist the sorted feature-name order used for hashing.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump"%args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)
        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))
        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    # xgb output keeps only whitelisted features; anything
                    # else is dropped (announced once on the first record).
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        if line_cnt == 0:
                            print "drop %s"%k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # Multi-valued features emit one hashed token per element.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            # Line format: "<label> <tok> <tok> ...".
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    # Report average CTR per split.
    for key, value in avg_ctr.items():
        print "%s, %s"%(key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
# Script entry: parse CLI options, optionally set up a cached
# MongoDB-backed ad-info lookup, load shared tables, then run main().
# NOTE(review): `parser` is not defined in this chunk — presumably an
# argparse.ArgumentParser built earlier in the file; verify.
args = parser.parse_args()
if args.mongo:
    from pymongo import MongoClient
    import functools32 as functools
    client = MongoClient('localhost', 27017)
    db = client.test

    @functools.lru_cache(maxsize=1000000)
    def get_ad_info(aid):
        # Cached lookup of one ad's metadata from MongoDB.
        ad_info = db.ad_info.find_one({"AdID": aid})
        return trans_ad_info(ad_info)

user_info_map = get_user_info()
category_map = get_category()
# Consume the pre-sorted count files produced by the earlier pipeline stage.
user_cnt_iter = read_tsv("data/user_cnt_%s.csv_sorted"%args.sz, delimiter=",")
user_aid_cnt_iter = next_row(read_tsv("data/user_aid_cnt_%s.csv_sorted"%args.sz, delimiter=","))
main()