Example #1
def prepareSmallPickle():
    # build a reduced, training-only data set plus matching id/bureau pickles
    joined = du.loadPickleDF("application")
    small = joined[joined["dataclass"] == "Train"].copy()
    small = small.drop(["dataclass"], axis=1)
    du.dfToPickle("small", small)
    # keep the training SK_ID_CURRs so other tables can be filtered to match
    skid = small[["SK_ID_CURR"]]
    du.dfToPickle("skidsmall", skid)
    # restrict the aggregated bureau features to the same training ids
    bbsmall = du.loadPickleDF("bureau")
    bbsmall = skid.merge(bbsmall, how='left', on='SK_ID_CURR')
    bbsmall = bbsmall.drop(["dataclass"], axis=1)
    du.dfToPickle("bureausmall", bbsmall)

    print("done")
Example #2
def quickHack():
    """Tried to see if higher thresholds for feature inclusion in pruned data set, would result
    in last features added having improved contributions. Didnt seem so :?"""
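    # The result_* pickles are assumed to hold one row per candidate feature
    # with at least these columns (inferred from the code below):
    #   COL   - feature name
    #   INCL  - True if the feature was kept in the pruned set
    #   SCORE - model score after this feature was evaluated
    #   RATIO - contribution ratio compared against the inclusion threshold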
    resultPath = "./output/eval_prune_3006"
    v = []
    last10 = []
    for i in range(0, 5):
        file = "result_" + "_" + str(i)
        result = du.loadPickleDF("result_" + str(i), path=resultPath)
        feats = result[result["INCL"] == True]["COL"].values
        print(file + ":" + str(len(feats)) + ":" +
              str(result.iloc[-1]["SCORE"]))
        v.append(result.iloc[-1]["SCORE"])
        rl10 = result.iloc[40:55]["RATIO"]  # ratios of the late-added features
        last10.append(rl10.mean())
    print(str(np.mean(v)) + ":" + str(np.std(v)))
    print(str(np.mean(last10)) + ":" + str(np.std(last10)))

    resultPath = "./output/eval_prune_3106"
    for th in (0.2, 0.5, 1.0):
        v = []
        last10 = []
        for i in range(0, 5):
            file = "result_" + str(th) + "_" + str(i)
            result = du.loadPickleDF("result_" + str(th) + "_" + str(i),
                                     path=resultPath)
            feats = result[result["INCL"] == True]["COL"].values
            print(file + ":" + str(len(feats)) + ":" +
                  str(result.iloc[-1]["SCORE"]))
            v.append(result.iloc[-1]["SCORE"])
            rl10 = result.iloc[40:55]["RATIO"]
            last10.append(rl10.mean())
        print(str(np.mean(v)) + ":" + str(np.std(v)))
        print(str(np.mean(last10)) + ":" + str(np.std(last10)))
Example #3
def bureauBalanceLoanPredictPickle(num_rows=None, nan_as_category=True):
    # num_rows and nan_as_category are accepted for interface parity with the
    # other *Pickle helpers but are not used here.
    # One-time CSV-to-pickle conversion, already handled elsewhere:
    # bureau = pd.read_csv('../data/csv/bureau.csv', nrows=num_rows)
    # bureau.to_pickle('../data/pickle/bureau.pck')
    bureau = pd.read_pickle('../data/pickle/bureau.pck')
    # bureau_balance is not needed for this target join:
    # bb = pd.read_pickle('../data/pickle/bureau_balance.pck')
    # attach TARGET and the train/test marker from the application table
    joined = du.loadPickleDF("application")
    joined = joined[["SK_ID_CURR", "TARGET", "dataclass"]]
    # bureau = joined.merge(bureau, how='left', on="SK_ID_CURR")
    bureau = bureau.merge(joined, how='left', on="SK_ID_CURR")
    # keep training rows only, then drop the marker
    bureau = bureau[bureau["dataclass"] == "Train"]
    bureau = bureau.drop(["dataclass"], axis=1)
    ut.process_categories(bureau, ["TARGET"], checkCats=False)
    du.dfToPickle("bureausmall-loanpredict", bureau)
    print("done")
Example #4
def comparePrunedFeatureSet():
    resultPath = "./output/"

    import pickle
    from random import randint
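    # train, target, params, scoreModel, fs and evalLGBModelAUC are assumed
    # to be defined at module level and are not shown here; by analogy with
    # the other examples (not necessarily the author's actual setup):
    #   train = du.loadPickleDF("bureausmall-loanpredict")
    #   target = train.pop("TARGET")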
    # hold out the first 30% as an evaluation split
    evalidx = int(len(train) * 0.3)
    evaltrain = train[:evalidx]
    evaltarget = target[:evalidx]
    train = train[evalidx:]
    target = target[evalidx:]

    # for th in (0.2,0.5,1.0):
    #     for i in range(0,5):
    # resume a partial run: only th=1.0, iterations 2..4 (the full grid is
    # commented out above)
    for th in (1.0, ):
        for i in range(2, 5):
            seed = randint(1, 60000)
            result = fs.createRandomFeatureSet(train, target, params,
                                               scoreModel, seed, th)
            du.dfToPickle("result_" + str(th) + "_" + str(i), result,
                          resultPath)
    gc.collect()

    pruned = []
    baseline = []
    for th in (0.2, 0.5, 1.0):
        for i in range(0, 5):
            seed = randint(1, 60000)
            result = du.loadPickleDF("result_" + str(th) + "_" + str(i),
                                     path=resultPath)
            feats = result[result["INCL"] == True]["COL"].values
            baseline.append(
                evalLGBModelAUC(train, target, evaltrain, evaltarget, params,
                                seed))
            pruned.append(
                evalLGBModelAUC(train[feats], target, evaltrain[feats],
                                evaltarget, params, seed))
            # persist intermediate results after every run
            with open(
                    resultPath + "eval_" + str(th) + "_" + str(i) + ".pck",
                    "wb") as fh:
                pickle.dump({"baseline": baseline, "pruned": pruned}, fh)
    print("------ done ")
Example #5
def bureauBalancePickle(num_rows=None, nan_as_category=True):
    # num_rows is accepted for interface parity but not used here

    bureau = pd.read_pickle('../data/pickle/bureau.pck')
    bb = pd.read_pickle('../data/pickle/bureau_balance.pck')

    # one_hot_encoder is defined elsewhere; see the sketch after this example
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # keep only bureau records updated within the last 180 days
    bureau = bureau[bureau["DAYS_CREDIT_UPDATE"] > -180]

    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index(
        [e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del bb, bb_agg
    gc.collect()

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat:
        cat_aggregations[cat] = ['mean']
    for cat in bb_cat:
        cat_aggregations[cat + "_MEAN"] = ['mean']

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({
        **num_aggregations,
        **cat_aggregations
    })
    bureau_agg.columns = pd.Index([
        'BURO_' + e[0] + "_" + e[1].upper()
        for e in bureau_agg.columns.tolist()
    ])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index([
        'ACTIVE_' + e[0] + "_" + e[1].upper()
        for e in active_agg.columns.tolist()
    ])
    bureau_agg = bureau_agg.join(active_agg, how='left')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index([
        'CLOSED_' + e[0] + "_" + e[1].upper()
        for e in closed_agg.columns.tolist()
    ])
    bureau_agg = bureau_agg.join(closed_agg, how='left')
    del closed, closed_agg, bureau
    gc.collect()
    # attach the aggregates to application ids (with TARGET and the
    # train/test marker) and persist the result
    joined = du.loadPickleDF("application")
    joined = joined[["SK_ID_CURR", "TARGET", "dataclass"]]
    bureau_agg = joined.join(bureau_agg, how='left', on="SK_ID_CURR")
    du.dfToPickle("bureau", bureau_agg)
Example #6
    print("done")


# One-time preparation: convert the raw CSVs to pickles, then build the
# derived data sets used by the experiments
du.csvdirToPickle("../data/csv")
applicationPickle()
bureauBalancePickle()
prepareSmallPickle()
bureauBalanceLoanPredictPickle()
debug = False
numBoostRounds = 5000
coreMax = 15

# train = du.loadPickle("traintest")
# train = du.loadPickleDF("small")
# train = du.loadPickleDF("bureausmall")
train = du.loadPickleDF("bureausmall-loanpredict")

# feats=["TARGET","dataclass",'BURO_DAYS_CREDIT_ENDDATE_MIN' ,'BURO_DAYS_CREDIT_ENDDATE_MAX', 'BURO_DAYS_CREDIT_ENDDATE_MEAN','ACTIVE_DAYS_CREDIT_ENDDATE_MIN', 'ACTIVE_DAYS_CREDIT_ENDDATE_MAX', 'ACTIVE_DAYS_CREDIT_ENDDATE_MEAN', 'CLOSED_DAYS_CREDIT_ENDDATE_MIN' ,'CLOSED_DAYS_CREDIT_ENDDATE_MAX', 'CLOSED_DAYS_CREDIT_ENDDATE_MEAN','BURO_MONTHS_BALANCE_MIN_MIN', 'BURO_MONTHS_BALANCE_MAX_MAX' ,'BURO_MONTHS_BALANCE_SIZE_MEAN', 'BURO_MONTHS_BALANCE_SIZE_SUM', 'ACTIVE_MONTHS_BALANCE_MIN_MIN', 'ACTIVE_MONTHS_BALANCE_MAX_MAX', 'ACTIVE_MONTHS_BALANCE_SIZE_MEAN','ACTIVE_MONTHS_BALANCE_SIZE_SUM','CLOSED_MONTHS_BALANCE_MIN_MIN', 'CLOSED_MONTHS_BALANCE_MAX_MAX' ,'CLOSED_MONTHS_BALANCE_SIZE_MEAN', 'CLOSED_MONTHS_BALANCE_SIZE_SUM',]
# feats=["TARGET","dataclass",'BURO_CNT_CREDIT_PROLONG_SUM','ACTIVE_CNT_CREDIT_PROLONG_SUM','CLOSED_CNT_CREDIT_PROLONG_SUM']
# train = train[feats]
# vis.visualizeCategorical(train)
# vis.visualizeNumerical(train)
# sys.exit(0)

if debug:
    # train = train.iloc[:6000]
    train = train[[
        "SK_ID_CURR", "TARGET", "DAYS_BIRTH", "AMT_GOODS_PRICE", "AMT_ANNUITY",
        "DAYS_EMPLOYED", "CODE_GENDER", "DAYS_ID_PUBLISH"
    ]]
    numBoostRounds = 500
Example #7
import matplotlib.pyplot as plt
from sklearn import svm
import gc

import sys

import datatools.datastore_util as du
import datatools.model_util as modelUtil
import datatools.visualize as vis
import pandas as pd
import numpy as np
import util.preprocess_util as ut
import time
from datatools.transform_util import cart2sphere
#
train = du.loadPickleDF("small")
# quick SVM experiment: two amount features, 500-row sample
train = train[["TARGET", 'AMT_CREDIT', 'AMT_ANNUITY']][0:500]
train = train.dropna()
y = train["TARGET"]
feats = ['AMT_CREDIT', 'AMT_ANNUITY']
X = train[feats]

# DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy() instead
X = X.to_numpy()
y = y.to_numpy()
# we create clusters with 1000 and 100 points
# rng = np.random.RandomState(0)
# n_samples_1 = 1000
# n_samples_2 = 100
# X = np.r_[1.5 * rng.randn(n_samples_1, 2),
#           0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
# y = [0] * (n_samples_1) + [1] * (n_samples_2)
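# The snippet stops after building X and y. A minimal continuation consistent
# with the svm/plt imports above (a sketch, not the author's actual
# experiment) would fit a linear SVC and plot the two features:
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k')
plt.xlabel('AMT_CREDIT')
plt.ylabel('AMT_ANNUITY')
plt.show()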