def cv_method(): tr_X, tr_y_true, te_X, te_y_true = get_train_data() if "nn" in args.model: tr_X = np.array(tr_X).astype(np.float32) tr_y_true = np.array(tr_y_true).astype(np.int32) model = get_nn_model(tr_X.shape) model.fit(tr_X, tr_y_true) write_dump("%s_model.dump" % args.model, model) if te_X: te_X = np.array(te_X).astype(np.float32) preds = model.predict_proba(te_X)[:, 1] np.savetxt("nn_preds.txt", preds) print log_loss(te_y_true, preds) elif "xgb" in args.model: dtrain = xgb.DMatrix(tr_X, label=tr_y_true) if args.predict == "cv": if te_X: dtest = xgb.DMatrix(te_X, label=te_y_true) param = { 'max_depth': 3, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic', "eval_metric": "logloss", "nthread": 9, } if te_X: watchlist = [(dtrain, 'train'), (dtest, "eval")] else: watchlist = [ (dtrain, 'train'), ] num_round = 132 bst = xgb.train(param, dtrain, num_round, watchlist) bst.save_model("%s_model.dump" % args.model) if te_X: preds = bst.predict(dtest) np.savetxt("xgb_preds.txt", preds)
def cv_method(): tr_X, tr_y_true, te_X, te_y_true = get_train_data() if "nn" in args.model: tr_X = np.array(tr_X).astype(np.float32) tr_y_true = np.array(tr_y_true).astype(np.int32) model = get_nn_model(tr_X.shape) model.fit(tr_X, tr_y_true) write_dump("%s_model.dump" % args.model, model) if te_X: te_X = np.array(te_X).astype(np.float32) preds = model.predict_proba(te_X)[:, 1] np.savetxt("nn_preds.txt", preds) print log_loss(te_y_true, preds) elif "xgb" in args.model: dtrain = xgb.DMatrix(tr_X, label=tr_y_true) if args.predict == "cv": if te_X: dtest = xgb.DMatrix(te_X, label=te_y_true) param = { "max_depth": 3, "eta": 0.1, "silent": 1, "objective": "binary:logistic", "eval_metric": "logloss", "nthread": 9, } if te_X: watchlist = [(dtrain, "train"), (dtest, "eval")] else: watchlist = [(dtrain, "train")] num_round = 132 bst = xgb.train(param, dtrain, num_round, watchlist) bst.save_model("%s_model.dump" % args.model) if te_X: preds = bst.predict(dtest) np.savetxt("xgb_preds.txt", preds)
def main():
    """Stream sessions and write hashed training instances to disk.

    Emits one line per (search, ad) row -- "<label> <id1> <id2> ..." --
    into data/{tr,cv,te}.<args.type> (name suffixed with args.test when
    set).  Feature values are turned into integer ids by hash_val().
    When args.type contains "xgb", feature names are routed through two
    whitelists: xgb_set hashes with type "xgb", xgb_sparse_set with
    "xgb2"; anything else is dropped.

    NOTE(review): this file defines main() several times; the last
    definition wins at import time.
    """
    random.seed(args.seed)
    # Dense / numeric feature names kept for xgboost-style output.
    xgb_set = set([
        "price_pos", "ot1_cnt", "bf_cnt", "bf_clk_cnt", "u_aid_ctr",
        "record_cnt", "show_cnt", "clk_cnt", "t_cnt", "qe_w_pos",
        "HistCTR", "qe_ng_min_pos", "t_show_cnt", "bf_ctr", "ot2_cnt",
        "Price", "qe_ng_cnt", "title_len", "hl_ucnt", "price_ratio",
        "hl_lcnt", "t_match", "qe_w_ratio", "qe_ng_ratio", "Position",
        "bf_3h_cnt", "qe_w_cnt", "af_cnt", "ot3_cnt", "af_3h_cnt",
        "adid_cnt", "IsUserLoggedOn",
    ])
    # High-cardinality categorical / text feature names (sparse hashing).
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type", "ca_match", "ca_pid_match",
        "CategoryID", "s_LocationID", "s_CategoryID",
        "UserAgentFamilyID", "UserAgentOSID", "UserDeviceID",
        "UserAgentID", "UserID", "IPID", "AdID", "SearchParams",
        "Params", "Title", "SearchQuery"
    ])
    # One output handle per data_type index -- presumably 0=train,
    # 1=cv, 2=test (fh_list is indexed by data_type below); verify
    # against the data() generator.
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")]
    else:
        fh_list = [open("data/tr.%s" % (args.type), "w"),
                   open("data/cv.%s" % (args.type), "w"),
                   open("data/te.%s" % (args.type), "w")]
    data_iter = data(args.test, maxlines=args.maxl)
    print "sr: %s" % args.sr
    # Per-split [clicks, impressions] for the CTR summary printed at end.
    avg_ctr = defaultdict(lambda: [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        # Mirror location/category under "s_"-prefixed names so they can
        # be consumed as search-side features.
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        # Sub-sample rows; sr is the sampling rate handed to filter_row.
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # First surviving session only: print the feature layout for
            # debugging and persist the ordered feature-name list.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            # NOTE(review): the args.sz branch writes the same file as
            # the final else branch -- possibly intentional; confirm.
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)
        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))
        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        # Not whitelisted: skip (log once on first line).
                        if line_cnt == 0:
                            print "drop %s" % k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # kt + 1 is the per-feature hash namespace; list/tuple
                # features contribute one hashed id per element.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
def main():
    """Stream sessions and write hashed training instances to disk.

    Variant of the previous main() that additionally supports a
    small-sample mode: when args.sz is set, the train/sinfo streams are
    read from pre-cut data/stream_<sz>.tsv and data/sinfo_<sz>.tsv files
    and no test stream is used.  Output format is identical:
    "<label> <hashed feature ids>" per row into data/{tr,cv,te}.<type>.

    NOTE(review): this file defines main() several times; the last
    definition wins at import time.
    """
    random.seed(args.seed)
    # Dense / numeric feature names kept for xgboost-style output.
    xgb_set = set([
        "price_pos", "ot1_cnt", "bf_cnt", "bf_clk_cnt", "u_aid_ctr",
        "record_cnt", "show_cnt", "clk_cnt", "t_cnt", "qe_w_pos",
        "HistCTR", "qe_ng_min_pos", "t_show_cnt", "bf_ctr", "ot2_cnt",
        "Price", "qe_ng_cnt", "title_len", "hl_ucnt", "price_ratio",
        "hl_lcnt", "t_match", "qe_w_ratio", "qe_ng_ratio", "Position",
        "bf_3h_cnt", "qe_w_cnt", "af_cnt", "ot3_cnt", "af_3h_cnt",
        "adid_cnt", "IsUserLoggedOn",
    ])
    # High-cardinality categorical / text feature names (sparse hashing).
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type", "ca_match", "ca_pid_match",
        "CategoryID", "s_LocationID", "s_CategoryID",
        "UserAgentFamilyID", "UserAgentOSID", "UserDeviceID",
        "UserAgentID", "UserID", "IPID", "AdID", "SearchParams",
        "Params", "Title", "SearchQuery"
    ])
    # One output handle per data_type index -- presumably 0=train,
    # 1=cv, 2=test (fh_list is indexed by data_type below).
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")
        ]
    else:
        fh_list = [
            open("data/tr.%s" % (args.type), "w"),
            open("data/cv.%s" % (args.type), "w"),
            open("data/te.%s" % (args.type), "w")
        ]
    if args.sz is not None:
        # Small-sample mode: read a pre-cut stream/sinfo pair; no test
        # iterator in this mode.
        train_iter = next_row(read_tsv("data/stream_%s.tsv" % args.sz))
        test_iter = iter([])
        sinfo_iter = read_tsv("data/sinfo_%s.tsv" % args.sz)
        data_iter = data(args.test, train_iter=train_iter,
                         test_iter=test_iter, sinfo_iter=sinfo_iter,
                         maxlines=args.maxl)
    else:
        data_iter = data(args.test, maxlines=args.maxl)
    print "sr: %s" % args.sr
    # Per-split [clicks, impressions] for the CTR summary printed at end.
    avg_ctr = defaultdict(lambda: [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        # Mirror location/category under "s_"-prefixed names so they can
        # be consumed as search-side features.
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        # Sub-sample rows; sr is the sampling rate handed to filter_row.
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # First surviving session only: print the feature layout and
            # persist the ordered feature-name list.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            # NOTE(review): the args.sz branch writes the same file as
            # the final else branch -- possibly intentional; confirm.
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)
        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))
        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        # Not whitelisted: skip (log once on first line).
                        if line_cnt == 0:
                            print "drop %s" % k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # kt + 1 is the per-feature hash namespace; list/tuple
                # features contribute one hashed id per element.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
def main():
    """Stream sessions and write hashed training instances to disk.

    Older/simpler variant: a single whitelist (xgb_set) filters features
    only when args.type == "xgb", filter_row is called without a
    sampling rate, and args.type itself is passed to hash_val() as the
    hash flavor.  Output format: "<label> <hashed feature ids>" per row
    into data/{tr,cv,te}.<args.type>.

    NOTE(review): this file defines main() several times; the last
    definition wins at import time.
    """
    random.seed(args.seed)
    # Feature names kept when producing xgboost-style output.
    xgb_set = set([
        "pos_type", "price_pos", "ot1_cnt", "pos_ot_type", "bf_cnt",
        "bf_clk_cnt", "u_aid_ctr", "record_cnt", "show_cnt", "clk_cnt",
        "t_cnt", "qe_w_pos", "HistCTR", "qe_ng_min_pos", "t_show_cnt",
        "bf_ctr", "ot2_cnt", "Price", "qe_ng_cnt", "title_len",
        "hl_ucnt", "price_ratio", "hl_lcnt", "t_match", "qe_w_ratio",
        "qe_ng_ratio", "ca_match", "Position", "bf_3h_cnt", "qe_w_cnt",
        "af_cnt", "ot3_cnt", "ca_pid_match", "af_3h_cnt",
    ])
    # One output handle per data_type index -- presumably 0=train,
    # 1=cv, 2=test (fh_list is indexed by data_type below).
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")
        ]
    else:
        fh_list = [
            open("data/tr.%s" % (args.type), "w"),
            open("data/cv.%s" % (args.type), "w"),
            open("data/te.%s" % (args.type), "w")
        ]
    data_iter = data(args.test, maxlines=args.maxl)
    # Per-split [clicks, impressions] for the CTR summary printed at end.
    avg_ctr = defaultdict(lambda: [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        # Mirror location/category under "s_"-prefixed names so they can
        # be consumed as search-side features.
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # First surviving session only: print the feature layout and
            # persist the ordered feature-name list.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            # NOTE(review): the args.sz branch writes the same file as
            # the final else branch -- possibly intentional; confirm.
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)
        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                # For xgb output, silently drop non-whitelisted features.
                if args.type == "xgb" and k not in xgb_set:
                    continue
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # kt + 1 is the per-feature hash namespace; list/tuple
                # features contribute one hashed id per element.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, args.type))
                else:
                    ins.append(hash_val(kt + 1, feas, args.type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
def main(): random.seed(args.seed) data_iter = data(args.test, maxlines=args.maxl) print "sr: %s"%args.sr uid_cnt = defaultdict(int) ipid_cnt = defaultdict(int) adid_cnt = defaultdict(int) query_cnt = defaultdict(int) title_cnt = defaultdict(int) query_param_cnt = defaultdict(int) ad_param_cnt = defaultdict(int) for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter): rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows) if not rows: continue ipid, uid = map(int, (sinfo["IPID"], sinfo["UserID"])) uid_cnt[uid] += len(rows) ipid_cnt[ipid] += len(rows) query = unicode(sinfo["SearchQuery"], "utf-8") val = map(lambda x : hash_val(0, x), query.split()) for v in val: query_cnt[v] += len(rows) sid = int(sinfo["SearchID"]) for v in get_se_param(sid): query_param_cnt[v] += len(rows) for row in rows: aid = int(row["AdID"]) adid_cnt[aid] += 1 ad_info = get_ad_info(aid) for v in ad_info["Params"]: ad_param_cnt[v] += 1 title = ad_info["Title"] title_val = map(lambda x : hash_val(0, x), title.split()) for v in title_val: title_cnt[v] += 1 if line_cnt % 100000 == 0: print "uid_cnt: %s"%len(uid_cnt) print "ipid_cnt: %s"%len(ipid_cnt) print "adid_cnt: %s"%len(adid_cnt) print "query_cnt: %s"%len(query_cnt) print "title_cnt: %s"%len(title_cnt) print "query_param_cnt: %s"%len(query_param_cnt) print "ad_param_cnt: %s"%len(ad_param_cnt) write_dump("data/uid_cnt.dump", uid_cnt) write_dump("data/ipid_cnt.dump", ipid_cnt) write_dump("data/adid_cnt.dump", adid_cnt) write_dump("data/query_cnt.dump", query_cnt) write_dump("data/title_cnt.dump", title_cnt) write_dump("data/query_param_cnt.dump", query_param_cnt) write_dump("data/ad_param_cnt.dump", ad_param_cnt)
def main():
    """Stream sessions and write hashed training instances to disk.

    Duplicate of the single-whitelist variant: xgb_set filters features
    only when args.type == "xgb", filter_row is called without a
    sampling rate, and args.type is forwarded to hash_val().  Output:
    "<label> <hashed feature ids>" per row into data/{tr,cv,te}.<type>.

    NOTE(review): this file defines main() several times; the last
    definition wins at import time.
    """
    random.seed(args.seed)
    # Feature names kept when producing xgboost-style output.
    xgb_set = set([
        "pos_type", "price_pos", "ot1_cnt", "pos_ot_type", "bf_cnt",
        "bf_clk_cnt", "u_aid_ctr", "record_cnt", "show_cnt", "clk_cnt",
        "t_cnt", "qe_w_pos", "HistCTR", "qe_ng_min_pos", "t_show_cnt",
        "bf_ctr", "ot2_cnt", "Price", "qe_ng_cnt", "title_len",
        "hl_ucnt", "price_ratio", "hl_lcnt", "t_match", "qe_w_ratio",
        "qe_ng_ratio", "ca_match", "Position", "bf_3h_cnt", "qe_w_cnt",
        "af_cnt", "ot3_cnt", "ca_pid_match", "af_3h_cnt",
    ])
    # One output handle per data_type index -- presumably 0=train,
    # 1=cv, 2=test (fh_list is indexed by data_type below).
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")]
    else:
        fh_list = [open("data/tr.%s" % (args.type), "w"),
                   open("data/cv.%s" % (args.type), "w"),
                   open("data/te.%s" % (args.type), "w")]
    data_iter = data(args.test, maxlines=args.maxl)
    # Per-split [clicks, impressions] for the CTR summary printed at end.
    avg_ctr = defaultdict(lambda: [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        # Mirror location/category under "s_"-prefixed names so they can
        # be consumed as search-side features.
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # First surviving session only: print the feature layout and
            # persist the ordered feature-name list.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            # NOTE(review): the args.sz branch writes the same file as
            # the final else branch -- possibly intentional; confirm.
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)
        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                # For xgb output, silently drop non-whitelisted features.
                if args.type == "xgb" and k not in xgb_set:
                    continue
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # kt + 1 is the per-feature hash namespace; list/tuple
                # features contribute one hashed id per element.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, args.type))
                else:
                    ins.append(hash_val(kt + 1, feas, args.type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
def main(): random.seed(args.seed) data_iter = data(args.test, maxlines=args.maxl) print "sr: %s" % args.sr uid_cnt = defaultdict(int) ipid_cnt = defaultdict(int) adid_cnt = defaultdict(int) query_cnt = defaultdict(int) title_cnt = defaultdict(int) query_param_cnt = defaultdict(int) ad_param_cnt = defaultdict(int) for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter): rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows) if not rows: continue ipid, uid = map(int, (sinfo["IPID"], sinfo["UserID"])) uid_cnt[uid] += len(rows) ipid_cnt[ipid] += len(rows) query = unicode(sinfo["SearchQuery"], "utf-8") val = map(lambda x: hash_val(0, x), query.split()) for v in val: query_cnt[v] += len(rows) sid = int(sinfo["SearchID"]) for v in get_se_param(sid): query_param_cnt[v] += len(rows) for row in rows: aid = int(row["AdID"]) adid_cnt[aid] += 1 ad_info = get_ad_info(aid) for v in ad_info["Params"]: ad_param_cnt[v] += 1 title = ad_info["Title"] title_val = map(lambda x: hash_val(0, x), title.split()) for v in title_val: title_cnt[v] += 1 if line_cnt % 100000 == 0: print "uid_cnt: %s" % len(uid_cnt) print "ipid_cnt: %s" % len(ipid_cnt) print "adid_cnt: %s" % len(adid_cnt) print "query_cnt: %s" % len(query_cnt) print "title_cnt: %s" % len(title_cnt) print "query_param_cnt: %s" % len(query_param_cnt) print "ad_param_cnt: %s" % len(ad_param_cnt) write_dump("data/uid_cnt.dump", uid_cnt) write_dump("data/ipid_cnt.dump", ipid_cnt) write_dump("data/adid_cnt.dump", adid_cnt) write_dump("data/query_cnt.dump", query_cnt) write_dump("data/title_cnt.dump", title_cnt) write_dump("data/query_param_cnt.dump", query_param_cnt) write_dump("data/ad_param_cnt.dump", ad_param_cnt)