Exemplo n.º 1
0
def gen_sub_by_para(drop_useless_pkg, drop_long):
    """Train an age (multiclass) LightGBM model and write a submission CSV.

    Args:
        drop_useless_pkg: forwarded to extend_feature; controls dropping
            low-value package features.
        drop_long: forwarded to extend_feature; controls dropping long records.

    Side effects:
        Writes ./sub/baseline_age_<best_logloss>.csv.
    """
    args = locals()
    lda_feature = get_lda_from_usage()
    # BUG FIX: the original hard-coded drop_useless_pkg=False / drop_long=False,
    # silently ignoring both function parameters.
    feature = extend_feature(span_no=24,
                             input=lda_feature,
                             drop_useless_pkg=drop_useless_pkg,
                             drop_long=drop_long)
    feature = extend_device_brand(feature)
    feature_label = attach_device_train_label(feature)

    # Labelled rows form the train set; rows with missing sex are to predict.
    train = feature_label[feature_label['sex'].notnull()]
    test = feature_label[feature_label['sex'].isnull()]

    X = train.drop(['sex', 'age', 'sex_age', 'device'], axis=1)
    Y = train['age']
    Y_CAT = pd.Categorical(Y)
    # FIX: Categorical.codes replaces the removed .labels alias
    # (same integer codes; consistent with the other examples in this file).
    X_train, X_test, y_train, y_test = train_test_split(X, Y_CAT.codes)
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    params = {
        'boosting_type': 'gbdt',
        'max_depth': 3,
        'random_state': 47,
        'verbose': -1,
        'colsample_bytree': 0.58,
        'min_data_in_leaf': 1472,
        'reg_alpha': 3,
        'reg_lambda': 4,
        'subsample': 0.8
    }

    # Objective-specific settings merged into the shared params below.
    params_age = {
        'metric': {'multi_logloss'},
        'num_class': 11,
        'objective': 'multiclass',
    }

    params_sex = {
        'metric': ['auc', 'binary_logloss'],
        'objective': 'binary',
    }

    try:
        gbm = lgb.train(dict(params, **params_age),
                        lgb_train,
                        num_boost_round=1000,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=50)
    except Exception as error:
        # Dump the feature-matrix layout to help diagnose dtype problems,
        # then re-raise: training failure must not be swallowed.
        print(
            f'Model input columns:{list(X.columns)}\n dict({X.dtypes.sort_values()})'
        )
        raise error

    best = round(gbm.best_score.get('valid_0').get('multi_logloss'), 5)
    best = "{:.5f}".format(best)

    pre_x = test.drop(['sex', 'age', 'sex_age', 'device'], axis=1)
    sub = pd.DataFrame(
        gbm.predict(pre_x.values, num_iteration=gbm.best_iteration))

    sub.columns = Y_CAT.categories
    print(sub.columns)
    sub['DeviceID'] = test['device'].values
    # NOTE(review): this reindex assumes the age categories are the strings
    # '0'..'10'; it would KeyError on integer categories — confirm upstream.
    sub = sub[[
        'DeviceID', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'
    ]]
    sub.head(3)

    print(
        f'=============Final train feature({len(feature_label.columns)}):\n{list(feature_label.columns)} \n {len(feature_label.columns)}'
    )

    file = f'./sub/baseline_age_{best}.csv'
    sub.to_csv(file, index=False)
Exemplo n.º 2
0
def gen_sub_by_para():
    """Train a 22-class (sex x age) LightGBM model and save train/test
    class probabilities for a later ensembling stage.

    Side effects:
        Persists the probability frames via save_result_for_ensemble.
    """
    args = locals()

    logger.debug(f'Run train dnn:{args}')
    feature_label = get_stable_feature('1006')

    # Labelled rows form the train set; rows with missing sex are to predict.
    train = feature_label[feature_label['sex'].notnull()]
    test = feature_label[feature_label['sex'].isnull()]

    X = train.drop(['sex', 'age', 'sex_age', 'device'], axis=1)
    Y = train['sex_age']
    Y_CAT = pd.Categorical(Y)
    # FIX: Categorical.codes replaces the removed .labels alias
    # (same integer codes; consistent with the later examples in this file).
    X_train, X_test, y_train, y_test = train_test_split(X, Y_CAT.codes)
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    params = {
        'boosting_type': 'gbdt',
        'metric': {'multi_logloss'},
        'num_class': 22,
        'objective': 'multiclass',
        'random_state': 47,
        'verbose': -1,
        'max_depth': 3,
        'feature_fraction': 0.2,
        'subsample': 0.4,
        'reg_alpha': 6,
        'reg_lambda': 4,
        'min_data_in_leaf': 1472,
        'learning_rate': 0.01,
    }

    try:
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=20000,
                        valid_sets=[lgb_train, lgb_eval],
                        early_stopping_rounds=50)

        print(f"Light GBM:{str(gbm)}")
    except Exception as error:
        # Dump the feature-matrix layout to help diagnose dtype problems,
        # then re-raise: training failure must not be swallowed.
        print(f'Model input columns:{list(X.columns)}\n dict({X.dtypes.sort_values()})')
        raise error

    # valid_1 is lgb_eval (valid_0 is the training set itself).
    best_score = round(gbm.best_score.get('valid_1').get('multi_logloss'), 5)
    best_epoch = gbm.best_iteration

    pre_x = test.drop(['sex', 'age', 'sex_age', 'device'], axis=1)

    # Probabilities on the full labelled set and on the predict set,
    # indexed by device so the ensemble stage can align rows.
    train_bk = pd.DataFrame(gbm.predict(train.drop(['sex', 'age', 'sex_age', 'device'], axis=1)),
                            index=train.device,
                            columns=Y_CAT.categories
                            )

    test_bk = pd.DataFrame(gbm.predict(pre_x),
                           index=test.device,
                           columns=Y_CAT.categories
                           )

    from code_felix.tiny.util import save_result_for_ensemble
    save_result_for_ensemble(f'{best_score}_{best_epoch}_lgb_{args}',
                             train=train_bk,
                             test=test_bk,
                             label=None,
                             )
Exemplo n.º 3
0
def gen_sub_by_para(drop_feature):
    """Fit an XGBoost model on the cut feature set and persist the class
    probabilities (train, test, labels) for the ensemble stage.

    Args:
        drop_feature: forwarded to get_cut_feature to prune the feature set.
    """
    args = locals()

    logger.debug(f'Run train dnn:{args}')
    feature_label = get_stable_feature('1011')
    feature_label = get_cut_feature(feature_label, drop_feature)

    # Rows with a known sex are labelled; the rest are the predict set.
    labelled = feature_label['sex'].notnull()
    train = feature_label[labelled]
    test = feature_label[~labelled]

    drop_cols = ['sex', 'age', 'sex_age', 'device']
    X = train.drop(drop_cols, axis=1)
    Y_CAT = pd.Categorical(train['sex_age'])
    X_train, X_test, y_train, y_test = train_test_split(X, Y_CAT.codes)

    gbm = get_model()
    logger.debug(f"Run the xgb with:{gpu_params}")
    # Fit on the full labelled set; the split above is kept only so the
    # feature-importance report below has an X_train to describe.
    gbm.fit(X, Y_CAT.codes, verbose=True)

    pre_x = test.drop(drop_cols, axis=1)

    print_imp_list(X_train, gbm)

    # Save result for ensemble: probabilities indexed by device.
    train_bk = pd.DataFrame(gbm.predict_proba(X),
                            index=train.device,
                            columns=Y_CAT.categories)

    test_bk = pd.DataFrame(gbm.predict_proba(pre_x),
                           index=test.device,
                           columns=Y_CAT.categories)

    label_bk = pd.DataFrame({'label': Y_CAT.codes}, index=train.device)

    save_result_for_ensemble(
        f'all_xgb_col_{len(feature_label.columns)}_{args}',
        train=train_bk,
        test=test_bk,
        label=label_bk,
    )
Exemplo n.º 4
0
def gen_sub_by_para():
    """Train XGBoost with early stopping on a held-out split and persist
    train/test class probabilities plus labels for ensembling."""
    args = locals()

    logger.debug(f'Run train dnn:{args}')
    feature_label = get_stable_feature('1011')

    # Rows with a known sex are labelled; the rest are the predict set.
    labelled = feature_label['sex'].notnull()
    train = feature_label[labelled]
    test = feature_label[~labelled]

    drop_cols = ['sex', 'age', 'sex_age', 'device']
    X = train.drop(drop_cols, axis=1)
    Y_CAT = pd.Categorical(train['sex_age'])
    X_train, X_test, y_train, y_test = train_test_split(X, Y_CAT.codes)

    gbm = get_model()
    logger.debug(f"Run the xgb with:{gpu_params}")
    gbm.fit(X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=50,
            verbose=True)

    results = gbm.evals_result()
    logger.debug(results)

    # validation_1 is the held-out (X_test, y_test) pair from eval_set above.
    eval_loss = np.array(results['validation_1']['mlogloss'])
    best_epoch = eval_loss.argmin() + 1
    best_score = eval_loss.min()

    logger.debug(f"Xgb arrive {best_score} at {best_epoch}")

    pre_x = test.drop(drop_cols, axis=1)

    print_imp_list(X_train, gbm)

    # Save result for ensemble; get_category() presumably supplies a
    # canonical category order shared across models — verify against callers.
    train_bk = pd.DataFrame(gbm.predict_proba(X),
                            index=train.device,
                            columns=get_category().categories)

    test_bk = pd.DataFrame(gbm.predict_proba(pre_x),
                           index=test.device,
                           columns=get_category().categories)

    label_bk = pd.DataFrame({'label': Y_CAT.codes}, index=train.device)

    save_result_for_ensemble(
        f'all_xgb_{args}',
        train=train_bk,
        test=test_bk,
        label=label_bk,
    )
Exemplo n.º 5
0
    for file in file_list:
        train, label, test = read_result_for_ensemble(file)

        train_list.append(train)
        if label is not None: label_list.append(label)
        test_list.append(test)

    train = pd.concat(train_list, axis=1)
    test = pd.concat(test_list, axis=1)
    label = label_list[0]


    train = train.sort_index()
    label = label.sort_index()

    X_train, X_test, y_train, y_test = train_test_split(train, label.iloc[:,0])

    #drop_list = list(np.arange(0.65, 0.7, 0.03))
   # drop_list.reverse()
    for dense in [128]:
      for drop_out in [0.63]:
        drop_out = round(drop_out, 2)
        patience=50
        lr = 0.0005
        #搭建融合后的模型
        inputs = Input((X_train.shape[1:]))

        x = Dropout(drop_out)(inputs)

        x = Dense(dense, activation='relu')(x)