def run(fold):
    """Train an XGBoost classifier with one fold held out and print its AUC.

    Reads the CSV at ``config.train_data_folds``, maps the ``income`` target
    to 0/1, runs ``utils.feature_engineering`` over the categorical columns,
    label-encodes every non-numeric feature, fits the model on all rows whose
    ``kfold`` differs from ``fold`` and scores it on the rows whose ``kfold``
    equals ``fold``.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.

    Returns:
        None. The validation AUC is printed to stdout.
    """
    # reading the data
    df = pd.read_csv(config.train_data_folds)

    num_cols = [
        'age', 'fnlwgt', 'capital.gain', 'capital.loss', 'hours.per.week'
    ]
    # columns that must never be used as model inputs
    non_feature_cols = ['income', 'kfold']

    target_map = {'<=50K': 0, '>50K': 1}
    df.income = df.income.map(target_map)

    # cat cols
    cat_features = [
        f for f in df.columns if f not in num_cols + non_feature_cols
    ]

    # feature engineering
    df = utils.feature_engineering(df, cat_features)

    # selecting the features (may include columns added by feature engineering)
    features = [f for f in df.columns if f not in non_feature_cols]

    # Every feature that is not a known numeric column is treated as
    # categorical. Computing the list once guarantees that the string cast
    # and the label encoding below are applied to exactly the same columns.
    categorical_cols = [f for f in features if f not in num_cols]

    for feat in categorical_cols:
        # treating NANS
        # NOTE(review): astype(str) runs before fillna, so missing values have
        # presumably already become the string 'nan' by the time fillna is
        # called -- confirm whether fillna was meant to run first.
        df.loc[:, feat] = df[feat].astype(str).fillna('NONE')

        # label encoding, decided per feature: the check must use the column
        # currently being encoded, not a variable left over from another loop,
        # otherwise numeric columns get encoded (or categorical ones skipped).
        lbl_enc = preprocessing.LabelEncoder()
        lbl_enc.fit(df[feat])
        df.loc[:, feat] = lbl_enc.transform(df[feat])

    # splitting the data based on the folds created
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train[features].values
    x_valid = df_valid[features].values

    # xgb
    model = xgb.XGBClassifier(n_jobs=-1, max_depth=7, n_estimators=200)

    model.fit(x_train, df_train.income.values)

    # AUC is computed from the predicted probability of class 1
    valid_pred = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.income.values, valid_pred)

    print('Fold: ', fold, 'Validation AUC: ', auc)
示例#2
0
def NB_model():
    """Fit a ``GaussianNB`` model and print its accuracy on the test split.

    Obtains the train/test data from ``utils.prepare_data``, builds the
    feature matrices with ``utils.feature_engineering``, and prints the
    number of records in each split, the number of feature columns and the
    accuracy of the fitted model on the test data.
    """
    train_df, test_df = utils.prepare_data()

    # Report the size of each split.
    size_reports = (
        ('训练集有{}条记录。', train_df),
        ('测试集有{}条记录。', test_df),
    )
    for template, frame in size_reports:
        print(template.format(len(frame)))

    train_features, test_features = utils.feature_engineering(
        train_df, test_df)
    print('共有{}维特征。'.format(train_features.shape[1]))

    train_labels = train_df['label'].values
    test_labels = test_df['label'].values

    # Model fitting and evaluation
    classifier = GaussianNB()
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)
    print('准确率:', accuracy_score(test_labels, predictions))
示例#3
0
def SVC_model():
    """Fit an ``SVC`` for several ``C`` values and print each test accuracy.

    Obtains the train/test data from ``utils.prepare_data``, builds the
    feature matrices with ``utils.feature_engineering``, prints the number of
    records in each split and the number of feature columns, then trains one
    ``SVC`` per candidate ``C`` and prints its accuracy on the test data.
    """
    train_df, test_df = utils.prepare_data()
    print('训练集有{}条记录。'.format(len(train_df)))
    print('测试集有{}条记录。'.format(len(test_df)))

    train_features, test_features = utils.feature_engineering(
        train_df, test_df)
    n_features = train_features.shape[1]
    print('共有{}维特征。'.format(n_features))

    train_labels = train_df['label'].values
    test_labels = test_df['label'].values

    # Model fitting: one classifier per candidate value of C, evaluated in
    # the order listed.
    for penalty in (0.0001, 1, 10000):
        classifier = SVC(C=penalty)
        classifier.fit(train_features, train_labels)
        score = accuracy_score(test_labels, classifier.predict(test_features))
        print('准确率:', score)
示例#4
0
    data_chunk = data_chunk[data_chunk.object_id != arr[len(arr) - 1]]
    data_chunk = data_chunk.reset_index(drop=True)

    meta_chunk = test_meta[test_meta['object_id'].isin(
        data_chunk['object_id'].unique())]
    meta_chunk = meta_chunk.reset_index(drop=True)

    g_data, eg_data, g_meta, eg_meta = utils.gal_split_data(
        data_chunk, meta_chunk, False)

    g_features = None
    eg_features = None

    if g_meta.shape[0] > 0:
        #make meta not drop object_id in the feature engineering function
        g_features = utils.feature_engineering(g_data, g_meta, False)
        if i_c == 0:
            g_features.to_csv('test_g_features.csv',
                              header=True,
                              mode='a',
                              index=False)
        else:
            g_features.to_csv('test_g_features.csv',
                              header=False,
                              mode='a',
                              index=False)

    if eg_meta.shape[0] > 0:

        eg_features = utils.feature_engineering(eg_data, eg_meta, False)
        if i_c == 0:
示例#5
0
import utils

# Alternative data root kept for reference: '/courses/cs342/Assignment2/'
path_to_data = ''


def _build_and_train(samples, meta, target, mlp_flag):
    """Run feature engineering, target preprocessing, scaling and MLP training.

    The four ``utils`` calls are made in a fixed order: feature engineering,
    target preprocessing, standardization, training. ``mlp_flag`` is passed
    through unchanged as the last positional argument of ``utils.train_mlp``.

    Returns:
        The tuple ``(features, wtable, labels, classes, target_map)`` where
        ``features`` is the standardized feature set.
    """
    features = utils.feature_engineering(samples, meta)
    wtable, labels, classes, target_map = utils.preprocess_target(target)
    features = utils.standardize_data(features)
    utils.train_mlp(features, wtable, labels, classes, target_map, mlp_flag)
    return features, wtable, labels, classes, target_map


train, train_meta = utils.load_train(path_to_data)

# Split the training data and metadata into the g_* and eg_* subsets.
(g_train, eg_train,
 g_meta, eg_meta,
 g_target, eg_target) = utils.gal_split_data(train, train_meta, True)

# The g_* subset is processed and trained first, then the eg_* subset.
(g_features, g_wtable, g_labels,
 g_classes, g_target_map) = _build_and_train(g_train, g_meta, g_target, True)

(eg_features, eg_wtable, eg_labels,
 eg_classes, eg_target_map) = _build_and_train(eg_train, eg_meta, eg_target,
                                               False)