示例#1
0
def main():
    train_iter = next_row(read_tsv("data/stream_%s.tsv"%args.sz))
    test_iter = iter([])
    sinfo_iter = read_tsv("data/sinfo_%s.tsv"%args.sz)
    del_keys_set = ["HistCTR", "SearchID", "ObjectType"]

    for t, (data_type, rows, sinfo) in enumerate(data(train_iter=train_iter, test_iter=test_iter, sinfo_iter=sinfo_iter)):
        uid = int(sinfo["UserID"])
        date_str = sinfo["SearchDate"]
        ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))

        rows = filter(lambda x : int(x["ObjectType"]) == 3, rows)
        for row in rows:
            for key in del_keys_set:
                del row[key]
            for key in row:
                row[key] = int(row[key]) if row[key] != "" else 0
        item = (
                ts,
                int(sinfo["SearchID"]),
                tuple([(row["AdID"], row["IsClick"], row["Position"]) for row in rows]),
                )
        uid_sid[uid].append(item)

    print "uid_sid: %s"%len(uid_sid)
    for uid in uid_sid:
        uid_sid[uid].sort()

    print "start user_cnt."
    file_name = "data/user_cnt_%s.csv"%args.sz
    with open(file_name, "w") as f:
        writer = DictWriter(f, fieldnames=["SearchID", "t_cnt", "bf_cnt", "af_cnt", "bf_3h_cnt", "af_3h_cnt", "bf_clk_cnt", "bag2", "bag1"])
        writer.writeheader()
        for uid in uid_sid:
            all_se = uid_sid[uid]
            writer.writerows(get_rows(all_se))
    os.system('sort -t"," -k1 -g -S 2G %s -o %s_sorted'%(file_name, file_name))

    print "start user_aid_cnt."
    file_name = "data/user_aid_cnt_%s.csv"%args.sz
    with open(file_name, "w") as f:
        writer = DictWriter(f, fieldnames=["SearchID", "AdID", "clk_cnt", "show_cnt", "t_show_cnt", "pos_clk_cnt", "pos_show_cnt"])
        writer.writeheader()
        for uid in uid_sid:
            all_se = uid_sid[uid]
            writer.writerows(get_aid_rows(uid, all_se))
    os.system('sort -t"," -k1 -g -S 2G %s -o %s_sorted'%(file_name, file_name))
示例#2
0
        def get_ad_info(aid):
            """Fetch the ad document for *aid* from Mongo and normalise it.

            NOTE(review): unlike the lru_cache-wrapped variants elsewhere in
            this file, this definition has no visible cache decorator in
            view -- confirm whether one sits just above this fragment.
            """
            ad_info = db.ad_info.find_one({"AdID": aid})
            return trans_ad_info(ad_info)

    # Precomputed frequency tables, one dump file per id/text column.
    uid_cnt_dict = read_dump("data/uid_cnt.dump")
    adid_cnt_dict = read_dump("data/adid_cnt.dump")
    ipid_cnt_dict = read_dump("data/ipid_cnt.dump")
    query_cnt_dict = read_dump("data/query_cnt.dump")
    title_cnt_dict = read_dump("data/title_cnt.dump")
    query_param_cnt_dict = read_dump("data/query_param_cnt.dump")
    ad_param_cnt_dict = read_dump("data/ad_param_cnt.dump")

    # Static lookup maps plus streaming per-user count features, then run
    # the extraction pipeline.  NOTE(review): these names appear to be read
    # as module-level globals by main() -- confirm against its body.
    user_info_map = get_user_info()
    category_map = get_category()
    user_cnt_iter = read_tsv("data/user_cnt.csv", delimiter=",")
    user_aid_cnt_iter = next_row(read_tsv("data/user_aid_cnt.csv", delimiter=","))
    main()













示例#3
0
def main():
    """Stream (data_type, rows, sinfo) records and emit hashed instances.

    For every ad row that survives filter_row, writes one line
    "<label> <h1> <h2> ..." to the tr/cv/te output file selected by
    data_type.  When args.type contains "xgb", only features in xgb_set
    (dense) and xgb_sparse_set (sparse) are kept, each hashed with its
    own scheme; all other feature names are dropped.  The feature-name
    order is frozen from the first record and dumped to feas_name*.dump.
    """
    random.seed(args.seed)
    # Dense/numeric feature names kept for xgb-style output.
    xgb_set = set([
        "price_pos",
        "ot1_cnt",
        "bf_cnt",
        "bf_clk_cnt",
        "u_aid_ctr",
        "record_cnt",
        "show_cnt",
        "clk_cnt",
        "t_cnt",
        "qe_w_pos",
        "HistCTR",
        "qe_ng_min_pos",
        "t_show_cnt",
        "bf_ctr",
        "ot2_cnt",
        "Price",
        "qe_ng_cnt",
        "title_len",
        "hl_ucnt",
        "price_ratio",
        "hl_lcnt",
        "t_match",
        "qe_w_ratio",
        "qe_ng_ratio",
        "Position",
        "bf_3h_cnt",
        "qe_w_cnt",
        "af_cnt",
        "ot3_cnt",
        "af_3h_cnt",
        "adid_cnt",
        "IsUserLoggedOn",
    ])
    # Sparse/categorical feature names, hashed with the "xgb2" scheme.
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type", "ca_match", "ca_pid_match", "CategoryID",
        "s_LocationID", "s_CategoryID", "UserAgentFamilyID", "UserAgentOSID",
        "UserDeviceID", "UserAgentID", "UserID", "IPID", "AdID",
        "SearchParams", "Params", "Title", "SearchQuery"
    ])
    # One output file per split; index 0/1/2 is selected by data_type below.
    if args.test:
        fh_list = [
            open("data/tr_%s.%s" % (args.test, args.type), "w"),
            open("data/cv_%s.%s" % (args.test, args.type), "w"),
            open("data/te_%s.%s" % (args.test, args.type), "w")
        ]
    else:
        fh_list = [
            open("data/tr.%s" % (args.type), "w"),
            open("data/cv.%s" % (args.type), "w"),
            open("data/te.%s" % (args.type), "w")
        ]

    if args.sz is not None:
        # Sized sample files; no separate test stream in this mode.
        train_iter = next_row(read_tsv("data/stream_%s.tsv" % args.sz))
        test_iter = iter([])
        sinfo_iter = read_tsv("data/sinfo_%s.tsv" % args.sz)
        data_iter = data(args.test,
                         train_iter=train_iter,
                         test_iter=test_iter,
                         sinfo_iter=sinfo_iter,
                         maxlines=args.maxl)
    else:
        data_iter = data(args.test, maxlines=args.maxl)

    print "sr: %s" % args.sr
    # data_type -> [click sum, row count]; used for the CTR report below.
    avg_ctr = defaultdict(lambda: [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            # Freeze the sorted feature-name order from the first record and
            # dump it so downstream consumers agree on slot ids.
            # NOTE(review): if record 0 is entirely filtered out above, this
            # branch never runs and feas_name stays unbound -- the loop below
            # would raise NameError on the next record.  Confirm record 0
            # always survives filter_row.
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump" % args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)

        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            # IsClick may be absent (e.g. unlabeled rows); label defaults to 0.
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        if line_cnt == 0:
                            print "drop %s" % k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                # Multi-valued features contribute one hash per element.
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    # Report average CTR per split as a sanity check.
    for key, value in avg_ctr.items():
        print "%s, %s" % (key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
示例#4
0
    # CLI flags shared by the extraction scripts.
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--mongo', type=int, default=0)
    parser.add_argument('--sz', type=int, default=None)
    # NOTE(review): argparse applies type= only to command-line strings, so
    # the default here remains the float 1e6, not an int.
    parser.add_argument('--maxl', type=int, default=1e6)
    parser.add_argument('--type', type=str, default="ins")
    parser.add_argument('--sr', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=9)
    parser.add_argument('--date', type=int, default=0)
    parser.add_argument('--log', type=int, default=1)
    args = parser.parse_args()

    if args.mongo:
        # Optional MongoDB-backed ad lookup; functools32 backports
        # lru_cache to Python 2.
        from pymongo import MongoClient
        import functools32 as functools
        client = MongoClient('localhost', 27017)
        db = client.test

        @functools.lru_cache(maxsize=1000000)
        def get_ad_info(aid):
            # Cached per-ad metadata fetch, normalised by trans_ad_info.
            ad_info = db.ad_info.find_one({"AdID": aid})
            return trans_ad_info(ad_info)

    # Static lookup tables plus the pre-sorted per-user count streams.
    user_info_map = get_user_info()
    category_map = get_category()
    user_cnt_iter = read_tsv("data/user_cnt_%s.csv_sorted" % args.sz,
                             delimiter=",")
    user_aid_cnt_iter = next_row(
        read_tsv("data/user_aid_cnt_%s.csv_sorted" % args.sz, delimiter=","))
    main()
示例#5
0

if __name__ == '__main__':
    # CLI flags for the extraction run.
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--mongo', type=int, default=0)
    parser.add_argument('--sz', type=int, default=None)
    # NOTE(review): argparse applies type= only to command-line strings, so
    # the default here remains the float 1e6, not an int.
    parser.add_argument('--maxl', type=int, default=1e6)
    parser.add_argument('--type', type=str, default="ins")
    parser.add_argument('--sr', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=9)
    args = parser.parse_args()

    if args.mongo:
        # Optional MongoDB-backed ad lookup; functools32 backports
        # lru_cache to Python 2.
        from pymongo import MongoClient
        import functools32 as functools
        client = MongoClient('localhost', 27017)
        db = client.test

        @functools.lru_cache(maxsize=1000000)
        def get_ad_info(aid):
            # Cached per-ad metadata fetch, normalised by trans_ad_info.
            ad_info = db.ad_info.find_one({"AdID": aid})
            return trans_ad_info(ad_info)

    # Static lookup tables plus streaming per-user count features.
    user_info_map = get_user_info()
    category_map = get_category()
    user_cnt_iter = read_tsv("data/user_cnt.csv", delimiter=",")
    user_aid_cnt_iter = next_row(
        read_tsv("data/user_aid_cnt.csv", delimiter=","))
    main()
示例#6
0
def main():
    random.seed(args.seed)
    xgb_set =set([
        "price_pos", "ot1_cnt", "bf_cnt", "bf_clk_cnt", 
        "u_aid_ctr", "record_cnt", "show_cnt", "clk_cnt", 
        "t_cnt", "qe_w_pos", "HistCTR", "qe_ng_min_pos", "t_show_cnt", 
        "bf_ctr", "ot2_cnt", "Price", 
        "qe_ng_cnt", "title_len", "hl_ucnt", 
        "price_ratio", "hl_lcnt", "t_match", "qe_w_ratio", 
        "qe_ng_ratio", "Position", 
        "bf_3h_cnt", "qe_w_cnt", 
        "af_cnt", "ot3_cnt",
        "af_3h_cnt", "adid_cnt", "IsUserLoggedOn",
        ])
    xgb_sparse_set = set([
        "pos_ot_type", "pos_type",
        "ca_match", "ca_pid_match",
        "CategoryID", "s_LocationID", "s_CategoryID",
        "UserAgentFamilyID", "UserAgentOSID", 
        "UserDeviceID", "UserAgentID",
        "UserID", "IPID", "AdID",
        "SearchParams", "Params", "Title", "SearchQuery"
        ])
    if args.test:
        fh_list = [ open("data/tr_%s.%s"%(args.test, args.type), "w"), 
                    open("data/cv_%s.%s"%(args.test, args.type), "w"), 
                    open("data/te_%s.%s"%(args.test, args.type), "w")]
    else:
        fh_list = [open("data/tr.%s"%(args.type), "w"), 
                    open("data/cv.%s"%(args.type), "w"), 
                    open("data/te.%s"%(args.type), "w")]

    if args.sz is not None:
        train_iter = next_row(read_tsv("data/stream_%s.tsv"%args.sz))
        test_iter = iter([])
        sinfo_iter = read_tsv("data/sinfo_%s.tsv"%args.sz)
        data_iter = data(args.test, train_iter=train_iter, test_iter=test_iter, sinfo_iter=sinfo_iter, maxlines=args.maxl)
    else:
        data_iter = data(args.test, maxlines=args.maxl)

    print "sr: %s"%args.sr
    avg_ctr = defaultdict(lambda : [0, 0])
    for line_cnt, (data_type, rows, sinfo) in enumerate(data_iter):
        sinfo["s_LocationID"] = int(sinfo["LocationID"])
        sinfo["s_CategoryID"] = int(sinfo["CategoryID"])
        extract_slot_feas(rows, sinfo)
        rows = filter(lambda x: filter_row(x, data_type, sr=args.sr), rows)
        if not rows:
            continue
        feature_map = get_features(sinfo, rows, data_type > 0)
        instances = extract(feature_map)
        if line_cnt == 0:
            for k, feas in feature_map.items():
                print "-" * 80
                print k
                print feas[0].keys()
            feas_name = sorted(instances[0].keys())
            print len(feas_name), feas_name
            if args.sz is not None:
                write_dump("feas_name.dump", feas_name)
            elif args.test:
                write_dump("feas_name%s.dump"%args.test, feas_name)
            else:
                write_dump("feas_name.dump", feas_name)

        # date_str = sinfo["SearchDate"]
        # ts = convert_ts(datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S.0"))

        fh = fh_list[data_type]
        for ins_map, row in zip(instances, rows):
            y = int(row.get("IsClick", 0))
            avg_ctr[data_type][0] += y
            avg_ctr[data_type][1] += 1
            ins = []
            for kt, k in enumerate(feas_name):
                if "xgb" in args.type:
                    if k in xgb_set:
                        hash_type = "xgb"
                    elif k in xgb_sparse_set:
                        hash_type = "xgb2"
                    else:
                        if line_cnt == 0:
                            print "drop %s"%k
                        continue
                else:
                    hash_type = ""
                feas = ins_map[k]
                if line_cnt == 0:
                    print kt, k, type(feas), feas
                if isinstance(feas, list) or isinstance(feas, tuple):
                    for f in feas:
                        ins.append(hash_val(kt + 1, f, hash_type))
                else:
                    ins.append(hash_val(kt + 1, feas, hash_type))
            fh.write(unicode(y) + " " + " ".join(map(unicode, ins)) + "\n")
    for key, value in avg_ctr.items():
        print "%s, %s"%(key, value[0] * 1. / value[1])
    for fh in fh_list:
        fh.close()
示例#7
0
    args = parser.parse_args()

    if args.mongo:
        # Optional MongoDB-backed ad lookup; functools32 backports
        # lru_cache to Python 2.
        from pymongo import MongoClient
        import functools32 as functools
        client = MongoClient('localhost', 27017)
        db = client.test
        @functools.lru_cache(maxsize=1000000)
        def get_ad_info(aid):
            # Cached per-ad metadata fetch, normalised by trans_ad_info.
            ad_info = db.ad_info.find_one({"AdID": aid})
            return trans_ad_info(ad_info)

    # Static lookup tables plus the pre-sorted per-user count streams.
    user_info_map = get_user_info()
    category_map = get_category()
    user_cnt_iter = read_tsv("data/user_cnt_%s.csv_sorted"%args.sz, delimiter=",")
    user_aid_cnt_iter = next_row(read_tsv("data/user_aid_cnt_%s.csv_sorted"%args.sz, delimiter=","))
    main()