Exemplo n.º 1
0
                set(train_df[train_df['user_id'].isin(
                    qtime_user_df['user_id'])]['user_id']))))

        train_x = train_df[train_df.columns.difference(
            ['user_id', 'item_id', 'label', 'truth_item_id'])].values
        train_y = train_df['label'].values

        valid_df = valid_df.sort_values('sim').reset_index(drop=True)
        valid_x = valid_df[valid_df.columns.difference(
            ['user_id', 'item_id', 'label', 'truth_item_id'])].values
        valid_y = valid_df['label'].values
        ''' 模型训练 '''
        time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print('------------------------ 模型训练 start time:{}'.format(time_str))
        # model = rank_rf(train_x, train_y)
        model = rank_xgb(train_x, train_y)
        one_train_auc = roc_auc_score(train_y,
                                      model.predict_proba(train_x)[:, 1])
        train_auc += one_train_auc
        print('train set: auc:{}'.format(one_train_auc))
        with open('./cache/model.pickle', 'wb') as f:
            pickle.dump(model, f)
        ''' 模型验证 '''
        time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print('------------------------ 模型验证 start time:{}'.format(time_str))
        pre_y = model.predict_proba(valid_x)[:, 1]
        one_valid_auc = roc_auc_score(valid_y, pre_y)
        valid_auc += one_valid_auc
        print('valid set: auc:{}'.format(one_valid_auc))
        answer = make_answer(valid_df[valid_df['label'] == 1], hot_df, phase=1)
Exemplo n.º 2
0
def _get_model(params):
    """Train and evaluate a ranker for ``conf.k`` rounds and return a loss.

    Each round splits the feature frame, fits a model via ``rank.rank_xgb``,
    scores the validation rows, and evaluates two rankings with
    ``eval.my_eval``: one driven by the existing ``sim`` column and one
    driven by the model's predicted probability of the positive class.

    Args:
        params: Mapping that holds
            * ``"feature"`` - the frame handed to
              ``featuring.train_test_split``;
            * ``"hot_df"`` - forwarded unchanged to ``eval.make_answer``;
            * optionally any of ``eta``, ``min_child_weight``, ``max_depth``,
              ``gamma``, ``subsample``, ``colsample_bytree``, ``reg_lambda``,
              ``scale_pos_weight``, ``tree_method``, ``n_estimators``.
              A missing key is forwarded to ``rank.rank_xgb`` as ``None``;
              ``max_depth`` and ``n_estimators`` are cast to ``int`` when
              present.

    Returns:
        The negated element at index 2 of the averaged score difference
        (model ranking minus ``sim`` ranking), i.e. the value printed as
        "ndcg half gain". NOTE(review): the sign flip presumably exists so
        that a minimising optimiser maximises the gain - confirm with caller.
    """
    feature_df = params.get("feature")
    hot_df = params.get("hot_df")

    # Hyperparameters forwarded to rank.rank_xgb; absent keys become None.
    hyper_keys = ('eta', 'min_child_weight', 'max_depth', 'gamma',
                  'subsample', 'colsample_bytree', 'reg_lambda',
                  'scale_pos_weight', 'tree_method', 'n_estimators')
    int_keys = ('max_depth', 'n_estimators')
    xgb_kwargs = {}
    for key in hyper_keys:
        if key not in params:
            xgb_kwargs[key] = None
        elif key in int_keys:
            xgb_kwargs[key] = int(params[key])
        else:
            xgb_kwargs[key] = params[key]

    # Columns that are identifiers or the target, not model inputs.
    non_feature_cols = ['user_id', 'item_id', 'label']

    valid_auc = 0
    # Accumulators for the 5 values added per round from eval.my_eval
    # (assumes my_eval returns something broadcastable to length 5).
    pre_score_arr = np.zeros(5)
    rank_score_arr = np.zeros(5)
    for _ in range(conf.k):
        # Train / validation split.
        # NOTE(review): the seed is the same constant on every round; if
        # train_test_split is deterministic for a given seed, all conf.k
        # rounds use the identical split - confirm this is intended.
        train_df, valid_df = featuring.train_test_split(feature_df, seed=1)

        train_x = train_df[
            train_df.columns.difference(non_feature_cols)].values
        train_y = train_df['label'].values

        valid_df = valid_df.sort_values('sim').reset_index(drop=True)
        valid_x = valid_df[
            valid_df.columns.difference(non_feature_cols)].values
        valid_y = valid_df['label'].values

        # Model training.
        model = rank.rank_xgb(train_x, train_y, **xgb_kwargs)

        # Model validation: probability of the positive class.
        pre_y = model.predict_proba(valid_x)[:, 1]
        valid_auc += roc_auc_score(valid_y, pre_y)

        answer = eval.make_answer(valid_df[valid_df['label'] == 1],
                                  hot_df,
                                  phase=1)

        # Baseline ranking by the existing 'sim' column vs. model ranking.
        pre_score_arr += eval.my_eval(list(valid_df['sim']),
                                      valid_df,
                                      answer,
                                      print_mark=False)
        rank_score_arr += eval.my_eval(pre_y,
                                       valid_df,
                                       answer,
                                       print_mark=False)

    avg_valid_auc = valid_auc / conf.k
    avg_pre_ndcg = pre_score_arr / conf.k
    avg_rank_ndcg = rank_score_arr / conf.k
    diff = avg_rank_ndcg - avg_pre_ndcg
    print('avg valid auc:{}, ndcg full gain:{}, ndcg half gain:{}'.format(
        avg_valid_auc, diff[0], diff[2]))

    return -diff[2]