Exemplo n.º 1
0
def cv_method():
    tr_X, tr_y_true, te_X, te_y_true = get_train_data()
    if "nn" in args.model:
        tr_X = np.array(tr_X).astype(np.float32)
        tr_y_true = np.array(tr_y_true).astype(np.int32)
        model = get_nn_model(tr_X.shape)
        model.fit(tr_X, tr_y_true)
        write_dump("%s_model.dump" % args.model, model)
        if te_X:
            te_X = np.array(te_X).astype(np.float32)
            preds = model.predict_proba(te_X)[:, 1]
            np.savetxt("nn_preds.txt", preds)
            print log_loss(te_y_true, preds)
    elif "xgb" in args.model:
        dtrain = xgb.DMatrix(tr_X, label=tr_y_true)
        if args.predict == "cv":
            if te_X:
                dtest = xgb.DMatrix(te_X, label=te_y_true)
            param = {
                'max_depth': 3,
                'eta': 0.1,
                'silent': 1,
                'objective': 'binary:logistic',
                "eval_metric": "logloss",
                "nthread": 9,
            }
            if te_X:
                watchlist = [(dtrain, 'train'), (dtest, "eval")]
            else:
                watchlist = [
                    (dtrain, 'train'),
                ]
            num_round = 132
            bst = xgb.train(param, dtrain, num_round, watchlist)
            bst.save_model("%s_model.dump" % args.model)
            if te_X:
                preds = bst.predict(dtest)
                np.savetxt("xgb_preds.txt", preds)
Exemplo n.º 2
0
def cv_method():
    """Cross-validation entry point: train the model selected by
    ``args.model`` on the training split and, if a held-out split is
    present, write its predictions to disk.

    Side effects: dumps the fitted model to "<model>_model.dump" and the
    held-out predictions to "nn_preds.txt" / "xgb_preds.txt".
    """
    tr_X, tr_y_true, te_X, te_y_true = get_train_data()
    if "nn" in args.model:
        # The NN expects dense float32 features and int32 labels.
        tr_X = np.array(tr_X).astype(np.float32)
        tr_y_true = np.array(tr_y_true).astype(np.int32)
        model = get_nn_model(tr_X.shape)
        model.fit(tr_X, tr_y_true)
        write_dump("%s_model.dump" % args.model, model)
        if te_X:
            te_X = np.array(te_X).astype(np.float32)
            # Positive-class probability only.
            preds = model.predict_proba(te_X)[:, 1]
            np.savetxt("nn_preds.txt", preds)
            print log_loss(te_y_true, preds)
    elif "xgb" in args.model:
        dtrain = xgb.DMatrix(tr_X, label=tr_y_true)
        if args.predict == "cv":
            if te_X:
                dtest = xgb.DMatrix(te_X, label=te_y_true)
            param = {
                "max_depth": 3,
                "eta": 0.1,
                "silent": 1,
                "objective": "binary:logistic",
                "eval_metric": "logloss",
                "nthread": 9,
            }
            # Include the held-out split in the watchlist when present.
            if te_X:
                watchlist = [(dtrain, "train"), (dtest, "eval")]
            else:
                watchlist = [(dtrain, "train")]
            num_round = 132
            bst = xgb.train(param, dtrain, num_round, watchlist)
            bst.save_model("%s_model.dump" % args.model)
            if te_X:
                preds = bst.predict(dtest)
                np.savetxt("xgb_preds.txt", preds)
Exemplo n.º 3
0
def main():
    random.seed(args.seed)
    xgb_set =set([
        "price_pos", "ot1_cnt", "bf_cnt", "bf_clk_cnt", 
        "u_aid_ctr", "record_cnt", "show_cnt", "clk_cnt", 
        "t_cnt", "qe_w_pos", "HistCTR", "qe_ng_min_pos", "t_show_cnt", 
        "bf_ctr", "ot2_cnt", "Price", 
        "qe_ng_cnt", "title_len", "hl_ucnt", 
        "price_ratio", "hl_lcnt", "t_match", "qe_w_ratio", 
        "qe_ng_ratio", "Position", 
        "bf_3h_cnt", "qe_w_cnt", 
        "af_cnt", "ot3_cnt",
        "af_3h_cnt", "adid_cnt", "IsUserLoggedOn",
        ])
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type",
        "ca_match", "ca_pid_match",
        "CategoryID", "s_LocationID", "s_CategoryID",
        "UserAgentFamilyID", "UserAgentOSID", 
        "UserDeviceID", "UserAgentID",
        "UserID", "IPID", "AdID",
        "SearchParams", "Params", "Title", "SearchQuery"
        ])
    if args.test:
        fh_list = [ open("data/tr_%s.%s"%(args.test, args.type), "w"), 
                    open("data/cv_%s.%s"%(args.test, args.type), "w"), 
                    open("data/te_%s.%s"%(args.test, args.type), "w")]
    else:
        fh_list = [open("data/tr.%s"%(args.type), "w"), 
                    open("data/cv.%s"%(args.type), "w"), 
                    open("data/te.%s"%(args.type), "w")]

    data_iter = data(args.test, maxlines=args.maxl)

    print "sr: %s"%args.sr
    avg_ctr = defaultdict(lambda : [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump"%args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)

        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))

        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        if line_cnt == 0:
                            print "drop %s"%k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s"%(key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
Exemplo n.º 4
0
def main():
    """Stream search records, hash their features, and write one
    "<label> <hash> <hash> ..." line per ad impression into the
    train/cv/test output file selected by each record's ``data_type``.
    Reports the average CTR per split on stdout when done.
    """
    random.seed(args.seed)
    # Dense numeric features: hashed with hash_type "xgb".
    xgb_set = set([
        "price_pos",
        "ot1_cnt",
        "bf_cnt",
        "bf_clk_cnt",
        "u_aid_ctr",
        "record_cnt",
        "show_cnt",
        "clk_cnt",
        "t_cnt",
        "qe_w_pos",
        "HistCTR",
        "qe_ng_min_pos",
        "t_show_cnt",
        "bf_ctr",
        "ot2_cnt",
        "Price",
        "qe_ng_cnt",
        "title_len",
        "hl_ucnt",
        "price_ratio",
        "hl_lcnt",
        "t_match",
        "qe_w_ratio",
        "qe_ng_ratio",
        "Position",
        "bf_3h_cnt",
        "qe_w_cnt",
        "af_cnt",
        "ot3_cnt",
        "af_3h_cnt",
        "adid_cnt",
        "IsUserLoggedOn",
    ])
    # Sparse/categorical features: hashed with hash_type "xgb2".
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type", "ca_match", "ca_pid_match", "CategoryID",
        "s_LocationID", "s_CategoryID", "UserAgentFamilyID", "UserAgentOSID",
        "UserDeviceID", "UserAgentID", "UserID", "IPID", "AdID",
        "SearchParams", "Params", "Title", "SearchQuery"
    ])
    # fh_list is indexed by data_type: 0 -> tr, 1 -> cv, 2 -> te.
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")
        ]
    else:
        fh_list = [
            open("data/tr.%s" % (args.type), "w"),
            open("data/cv.%s" % (args.type), "w"),
            open("data/te.%s" % (args.type), "w")
        ]

    # With args.sz, read a pre-sampled stream/sinfo pair instead of the
    # default data source; the test iterator is then empty.
    if args.sz is not None:
        train_iter = next_row(read_tsv("data/stream_%s.tsv" % args.sz))
        test_iter = iter([])
        sinfo_iter = read_tsv("data/sinfo_%s.tsv" % args.sz)
        data_iter = data(args.test,
                         train_iter=train_iter,
                         test_iter=test_iter,
                         sinfo_iter=sinfo_iter,
                         maxlines=args.maxl)
    else:
        data_iter = data(args.test, maxlines=args.maxl)

    print "sr: %s" % args.sr
    # data_type -> [click_sum, impression_count] for the CTR report.
    avg_ctr = defaultdict(lambda: [0, 0])
    # NOTE(review): if the first record yields no rows after filtering,
    # feas_name is never assigned below and enumerate(feas_name) raises
    # a NameError on the next kept record.
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # One-off debug dump of the feature layout.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)

        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))

        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        # Feature is neither dense nor sparse: drop it.
                        if line_cnt == 0:
                            print "drop %s" % k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # Multi-valued features contribute one hash per value.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
Exemplo n.º 5
0
def main():
    """Stream search records, hash their features, and write one
    "<label> <hash> <hash> ..." line per ad impression into the
    train/cv/test output file selected by each record's ``data_type``.
    Reports the average CTR per split on stdout when done.
    """
    random.seed(args.seed)
    # Feature whitelist used only when args.type == "xgb"; everything
    # else is dropped from the output instances.
    xgb_set = set([
        "pos_type",
        "price_pos",
        "ot1_cnt",
        "pos_ot_type",
        "bf_cnt",
        "bf_clk_cnt",
        "u_aid_ctr",
        "record_cnt",
        "show_cnt",
        "clk_cnt",
        "t_cnt",
        "qe_w_pos",
        "HistCTR",
        "qe_ng_min_pos",
        "t_show_cnt",
        "bf_ctr",
        "ot2_cnt",
        "Price",
        "qe_ng_cnt",
        "title_len",
        "hl_ucnt",
        "price_ratio",
        "hl_lcnt",
        "t_match",
        "qe_w_ratio",
        "qe_ng_ratio",
        "ca_match",
        "Position",
        "bf_3h_cnt",
        "qe_w_cnt",
        "af_cnt",
        "ot3_cnt",
        "ca_pid_match",
        "af_3h_cnt",
    ])
    # fh_list is indexed by data_type: 0 -> tr, 1 -> cv, 2 -> te.
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")
        ]
    else:
        fh_list = [
            open("data/tr.%s" % (args.type), "w"),
            open("data/cv.%s" % (args.type), "w"),
            open("data/te.%s" % (args.type), "w")
        ]

    data_iter = data(args.test, maxlines=args.maxl)

    # data_type -> [click_sum, impression_count] for the CTR report.
    avg_ctr = defaultdict(lambda: [0, 0])
    # NOTE(review): if the first record yields no rows after filtering,
    # feas_name is never assigned below and enumerate(feas_name) raises
    # a NameError on the next kept record.
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # One-off debug dump of the feature layout.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)

        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if args.type == "xgb" and k not in xgb_set:
                    continue
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # Multi-valued features contribute one hash per value.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, args.type))
                else:
                    ins.append(hash_val(kt + 1, feas, args.type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
Exemplo n.º 6
0
def main():
    random.seed(args.seed)
    data_iter = data(args.test, maxlines=args.maxl)
    print "sr: %s"%args.sr

    uid_cnt = defaultdict(int)
    ipid_cnt = defaultdict(int)
    adid_cnt = defaultdict(int)
    query_cnt = defaultdict(int)
    title_cnt = defaultdict(int)
    query_param_cnt = defaultdict(int)
    ad_param_cnt = defaultdict(int)

    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        ipid, uid = map(int, (sinfo["IPID"], sinfo["UserID"]))
        uid_cnt[uid] += len(rows)
        ipid_cnt[ipid] += len(rows)

        query = unicode(sinfo["SearchQuery"], "utf-8")
        val = map(lambda x : hash_val(0, x), query.split())
        for v in val:
            query_cnt[v] += len(rows)

        sid = int(sinfo["SearchID"])
        for v in get_se_param(sid):
            query_param_cnt[v] += len(rows)

        for row in rows:
            aid = int(row["AdID"])
            adid_cnt[aid] += 1

            ad_info = get_ad_info(aid)
            for v in ad_info["Params"]:
                ad_param_cnt[v] += 1

            title = ad_info["Title"]
            title_val = map(lambda x : hash_val(0, x), title.split())
            for v in title_val:
                title_cnt[v] += 1
        if line_cnt % 100000 == 0:
            print "uid_cnt: %s"%len(uid_cnt)
            print "ipid_cnt: %s"%len(ipid_cnt)
            print "adid_cnt: %s"%len(adid_cnt)
            print "query_cnt: %s"%len(query_cnt)
            print "title_cnt: %s"%len(title_cnt)
            print "query_param_cnt: %s"%len(query_param_cnt)
            print "ad_param_cnt: %s"%len(ad_param_cnt)

    write_dump("data/uid_cnt.dump", uid_cnt)
    write_dump("data/ipid_cnt.dump", ipid_cnt)
    write_dump("data/adid_cnt.dump", adid_cnt)
    write_dump("data/query_cnt.dump", query_cnt)
    write_dump("data/title_cnt.dump", title_cnt)
    write_dump("data/query_param_cnt.dump", query_param_cnt)
    write_dump("data/ad_param_cnt.dump", ad_param_cnt)
Exemplo n.º 7
0
def main():
    """Stream search records, hash their features, and write one
    "<label> <hash> <hash> ..." line per ad impression into the
    train/cv/test output file selected by each record's ``data_type``.
    Reports the average CTR per split on stdout when done.
    """
    random.seed(args.seed)
    # Feature whitelist used only when args.type == "xgb"; everything
    # else is dropped from the output instances.
    xgb_set =set([
        "pos_type", "price_pos", "ot1_cnt", "pos_ot_type",
        "bf_cnt", "bf_clk_cnt", "u_aid_ctr", "record_cnt",
        "show_cnt", "clk_cnt", "t_cnt", "qe_w_pos",
        "HistCTR", "qe_ng_min_pos", "t_show_cnt", "bf_ctr",
        "ot2_cnt", "Price", "qe_ng_cnt", "title_len",
        "hl_ucnt", "price_ratio", "hl_lcnt", "t_match",
        "qe_w_ratio", "qe_ng_ratio", "ca_match", "Position",
        "bf_3h_cnt", "qe_w_cnt", "af_cnt", "ot3_cnt",
        "ca_pid_match", "af_3h_cnt",
        ])
    # fh_list is indexed by data_type: 0 -> tr, 1 -> cv, 2 -> te.
    if args.test:
        fh_list = [ open("data/tr_%s.%s"%(args.test, args.type), "w"),
                    open("data/cv_%s.%s"%(args.test, args.type), "w"),
                    open("data/te_%s.%s"%(args.test, args.type), "w")]
    else:
        fh_list = [open("data/tr.%s"%(args.type), "w"),
                    open("data/cv.%s"%(args.type), "w"),
                    open("data/te.%s"%(args.type), "w")]

    data_iter = data(args.test, maxlines=args.maxl)

    # data_type -> [click_sum, impression_count] for the CTR report.
    avg_ctr = defaultdict(lambda : [0, 0])
    # NOTE(review): if the first record yields no rows after filtering,
    # feas_name is never assigned below and enumerate(feas_name) raises
    # a NameError on the next kept record.
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # One-off debug dump of the feature layout.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump"%args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)

        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if args.type == "xgb" and k not in xgb_set:
                    continue
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # Multi-valued features contribute one hash per value.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, args.type))
                else:
                    ins.append(hash_val(kt + 1, feas, args.type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s"%(key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
Exemplo n.º 8
0
def main():
    """Accumulate impression counts per user, IP, ad, query token,
    title token, and search/ad parameter over the data stream, then
    dump each counter table under data/.
    """
    random.seed(args.seed)
    data_iter = data(args.test, maxlines=args.maxl)
    print "sr: %s" % args.sr

    uid_cnt = defaultdict(int)
    ipid_cnt = defaultdict(int)
    adid_cnt = defaultdict(int)
    query_cnt = defaultdict(int)
    title_cnt = defaultdict(int)
    query_param_cnt = defaultdict(int)
    ad_param_cnt = defaultdict(int)

    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        # Search-side counters are weighted by the number of kept rows.
        ipid, uid = map(int, (sinfo["IPID"], sinfo["UserID"]))
        uid_cnt[uid] += len(rows)
        ipid_cnt[ipid] += len(rows)

        query = unicode(sinfo["SearchQuery"], "utf-8")
        val = map(lambda x: hash_val(0, x), query.split())
        for v in val:
            query_cnt[v] += len(rows)

        sid = int(sinfo["SearchID"])
        for v in get_se_param(sid):
            query_param_cnt[v] += len(rows)

        # Ad-side counters are incremented once per kept row.
        for row in rows:
            aid = int(row["AdID"])
            adid_cnt[aid] += 1

            ad_info = get_ad_info(aid)
            for v in ad_info["Params"]:
                ad_param_cnt[v] += 1

            title = ad_info["Title"]
            title_val = map(lambda x: hash_val(0, x), title.split())
            for v in title_val:
                title_cnt[v] += 1
        # Periodic progress report on counter-table sizes.
        if line_cnt % 100000 == 0:
            print "uid_cnt: %s" % len(uid_cnt)
            print "ipid_cnt: %s" % len(ipid_cnt)
            print "adid_cnt: %s" % len(adid_cnt)
            print "query_cnt: %s" % len(query_cnt)
            print "title_cnt: %s" % len(title_cnt)
            print "query_param_cnt: %s" % len(query_param_cnt)
            print "ad_param_cnt: %s" % len(ad_param_cnt)

    write_dump("data/uid_cnt.dump", uid_cnt)
    write_dump("data/ipid_cnt.dump", ipid_cnt)
    write_dump("data/adid_cnt.dump", adid_cnt)
    write_dump("data/query_cnt.dump", query_cnt)
    write_dump("data/title_cnt.dump", title_cnt)
    write_dump("data/query_param_cnt.dump", query_param_cnt)
    write_dump("data/ad_param_cnt.dump", ad_param_cnt)