Пример #1
0
import gc

import pandas as pd

import feature_engineerer

# Column dtypes handed to pandas.read_csv; narrow unsigned integers are used
# instead of the default 64-bit types.
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32'
}

train_df = pd.read_csv('../input/train.csv', nrows=1000 * 100, dtype=dtypes)
# The return value is ignored, so the helper presumably adds its columns to
# the frame in place — confirm against feature_engineerer.
feature_engineerer.do_feature_engineering(train_df)
len_train = len(train_df)

# Hold out the trailing rows for validation. The requested hold-out is
# 300000 rows, but fewer rows than that may have been loaded; slicing with
# the resulting negative offset would put every row in the validation set
# and leave the training set empty. In that case fall back to the last 20%.
val_size = 300000
if val_size >= len_train:
    val_size = len_train // 5
split_at = len_train - val_size

val_df = train_df[split_at:]
train_df = train_df[:split_at]

target = 'is_attributed'
# 'hour' and 'hourly_click_count' are not columns listed in dtypes above;
# presumably they are produced by do_feature_engineering — confirm.
predictors = ['app', 'device', 'os', 'channel', 'hour', 'hourly_click_count']
categorical = ['app', 'device', 'os', 'channel', 'hour']

print("Training...")
params = {
    'learning_rate': 0.1,
    #'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 1400,  # we should let it be smaller than 2^(max_depth)
    'max_depth': 3,  # -1 means no limit
    'min_child_samples':
Пример #2
0
import pandas as pd
import feature_engineerer

# Load the training extract and run the shared feature-engineering step on it.
train = pd.read_csv('../input/train_first_1000k.csv')

# The return value is ignored, so the helper presumably modifies the frame in
# place — confirm against feature_engineerer.
feature_engineerer.do_feature_engineering(train)

# Quick inspection: summary statistics and the number of distinct IPs.
print(train.describe())
print(train["ip"].nunique())

# Stop after the inspection. SystemExit is raised directly instead of calling
# exit(): that helper is injected by the site module for interactive sessions
# and is not available when the interpreter runs without site (python -S).
raise SystemExit(0)

# NOTE(review): unreachable while the early exit above is in place. Remove the
# exit to actually write the engineered frame to disk.
train.to_csv("../output/engineered_first_1000k.csv",
             float_format='%.6f',
             index=False)
import pandas as pd
import csv_loader
import feature_engineerer

# Run the feature-engineering step on the first rows of the test file, show a
# preview, and write the resulting frame out as CSV.
SAMPLE_ROWS = 100000
SOURCE_CSV = '../input/test.csv'
CHECK_CSV = "../output/checking.csv"

dtypes = csv_loader.get_dtypes()
df = pd.read_csv(SOURCE_CSV, nrows=SAMPLE_ROWS, dtype=dtypes)

# Return value is not used; presumably the frame is modified in place —
# confirm against feature_engineerer.
feature_engineerer.do_feature_engineering(df)

preview = df.head()
print(preview)

# index=False keeps the row index out of the written file.
df.to_csv(CHECK_CSV, index=False)
Пример #4
0
import pandas as pd
import numpy as np
import pocket_lgb
from sklearn import model_selection
import feature_engineerer
import holdout_validator
import csv_loader

# Cross-validate the pocket_lgb model with an unshuffled K-fold split over the
# first rows of the training file.
TRAIN_CSV = '../input/train_day3.csv'

dtypes = csv_loader.get_dtypes()
input_df = pd.read_csv(TRAIN_CSV, dtype=dtypes, nrows=1000000)

# Return value is not used; presumably the frame is modified in place —
# confirm against feature_engineerer.
feature_engineerer.do_feature_engineering(input_df)
summary = input_df.describe()
print(summary)

split_number = 5
skf = model_selection.KFold(n_splits=split_number)
lgb = pocket_lgb.GoldenLgb()
total_score = 0
first_model = None
for train_index, test_index in skf.split(input_df):
    # Select this fold's rows by position and re-wrap each selection in a new
    # DataFrame object before handing it to the trainer.
    train_df = pd.DataFrame(input_df.iloc[train_index])
    test_df = pd.DataFrame(input_df.iloc[test_index])

    model = lgb.do_train(train_df, test_df)
    # AUC recorded for the first validation set of this fold's model.
    score = model.best_score["valid_0"]["auc"]
    total_score = total_score + score

    # Remember only the model from the first fold.
    first_model = model if first_model is None else first_model
Пример #5
0
import pandas as pd
import feature_engineerer
import csv_loader

# Compare summary statistics of the test sample before and after feature
# engineering, then summarise only the predictor columns.
TEST_CSV = '../input/test.csv'

categorical = ['app', 'device', 'os', 'channel', 'hour']
# Predictors are the categorical columns plus the hourly click count.
predictors = categorical + ['hourly_click_count']
dtypes = csv_loader.get_dtypes()

test = pd.read_csv(TEST_CSV, nrows=10000, dtype=dtypes)

# Statistics of the raw columns.
print(test.describe())

# Return value is not used; presumably the frame is modified in place —
# confirm against feature_engineerer.
feature_engineerer.do_feature_engineering(test)

# Statistics again, after the engineering step has run.
print(test.describe())

# Restrict to the predictor columns and summarise those alone.
wtf = test[predictors]
print(wtf.describe())