def test_sample():
    ''' Generate a 500k random sample of the testing data '''
    filepath = dp("refactored/densified_test.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors)

    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TEST
    df['raw_data_index'] = N_TRAIN + np.arange(N_TEST)

    print("Uniformly sampling 500k rows from the testing data ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(500000)

    print("Featherizing sample ... ")
    df.iloc[idx_random].reset_index(drop=True).to_feather(
        dp("refactored/test_sample.feather"))
def split_train():
    filepath = dp("refactored/densified_train.feather")
    print("loading " + filepath)
    df = read_feather(filepath, columns=types.predictors + [types.response])
    # Append the raw data index
    assert 'raw_data_index' not in df.columns
    assert df.shape[0] == N_TRAIN
    df['raw_data_index'] = range(N_TRAIN)

    print("Splitting into subgroups ... ")
    np.random.seed(0)
    idx_random = np.random.permutation(N_TRAIN)
    idx_groups = np.array_split(idx_random, NGROUPS)

    for k, idxs in enumerate(idx_groups):
        sk = str(k)
        print("... featherizing group " + sk + " of " + str(NGROUPS))
        fname = dp("refactored/train_split_" + sk + ".feather")
        df.iloc[idxs].reset_index(drop=True).to_feather(fname)
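

# Build per-column metadata: the distinct values with their counts and an integer
# code ('new_' + col); version-like columns are ordered via build_version_index.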
def make_metadata(col):
    print("Beginning column " + col)
    newcol = 'new_' + col
    X = load_features(col)
    series = X[col]
    md = pd.DataFrame(series.value_counts(dropna=False))
    md['counts'] = md[col]
    md[col] = md.index
    if col in VERSION_COLS:
        idx_df = build_version_index(series)
        md = md.merge(idx_df, how='left')
        md.sort_values(newcol, inplace=True)
    else:
        md[newcol] = range(md.shape[0])
    md.to_csv(dp('metadata/' + col), index=False)
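

# Re-encode one raw column for modeling: categorical predictors get the integer codes
# from their metadata CSV (unseen values fall through to code metadata.shape[0]);
# numeric predictors and non-predictor columns pass through unchanged.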
def refactor_col(infile, col):
    print(" ... " + col)
    series = read_feather(infile, columns=[col])[col]
    if col in PREDCOLS:
        metadata = pd.read_csv(dp("metadata/" + col)).drop('counts', axis=1)
        if col in FeaturesByType.categorical:
            df = pd.DataFrame({col: series})
            df['order'] = range(df.shape[0])
            df = df.merge(metadata, how='left').sort_values('order')
            newcol = 'new_' + col
            series = df[newcol].fillna(metadata.shape[0])
            values = series.astype(
                np.int64
            ).values  # Lightgbm treats this as missing for categorical features
        elif metadata[col].dtype.name in ['int64', 'float64']:
            values = series.values
        else:
            pdb.set_trace()
    else:
        values = series.values
    return values
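
# Minimal sketch (hypothetical toy data, not from the project) of the categorical
# re-encoding step inside refactor_col: known levels map to their integer codes from
# the metadata table, and unseen levels fall through to the code metadata.shape[0].
toy = pd.Series(['a', 'b', 'c', 'a'])  # raw column; 'c' does not appear in the metadata
meta = pd.DataFrame({'col': ['a', 'b'], 'new_col': [0, 1]})  # stand-in for a metadata CSV
tmp = pd.DataFrame({'col': toy, 'order': range(len(toy))})
tmp = tmp.merge(meta, how='left').sort_values('order')
codes = tmp['new_col'].fillna(meta.shape[0]).astype(np.int64).values
print(codes)  # [0 1 2 0]: the unseen level 'c' gets code 2 == meta.shape[0]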
Example #5
        models = [cols2density(df, cols) for cols in groups]
    print("Scoring on all groups ...")
    scores_df = pd.DataFrame({
        'pydens_' + str(k): models[k].density(df[groups[k]])
        for k in range(len(groups))
    })
    print("Concatenating ...")
    new_df = pd.concat([df, scores_df], axis=1)
    print("Feathering ...")
    new_df.to_feather(outfile)
    return models


# Constants
np.random.seed(0)
ALLCOLS = pd.read_csv(dp('raw/train.csv'), nrows=2).columns.tolist()
DENSIFIABLE_COLS = [
    a for a in ALLCOLS
    if ((a != 'HasDetections') and (a != 'MachineIdentifier'))
]
N = 25  # number of features to densify per run
N_GROUPS = 10
GROUPS = [select_group(DENSIFIABLE_COLS, N).tolist() for _ in range(N_GROUPS)]

# Main
models = densify(infile=dp("refactored/train.feather"),
                 outfile=dp("refactored/densified_train.feather"),
                 groups=GROUPS)
_ = densify(infile=dp("refactored/test.feather"),
            outfile=dp("refactored/densified_test.feather"),
            groups=GROUPS,
Example #6
from feather import read_dataframe as read_feather
import numpy as np
import os
import pandas as pd
import pdb
import pickle
from time import time

from zpylib import datatools
from zpylib.learn.gbm import Lgbm
from zpylib import stats
from zpylib import data_path as dp
from zpylib import model_path as mp

# Constants
TRAIN_PATHS = [
    dp('refactored/' + st) for st in os.listdir(dp('refactored'))
    if 'train_' in st
]
TRAIN_PATHS.sort()
TEST_PATH = dp('refactored/test.feather')
MODELS_PATH = mp('rf_3feather.pkl')


def read_response(f):
    col = 'HasDetections'
    return read_feather(f, columns=[col])[col].values


def multi_read_response(files):
    return np.concatenate([read_response(f) for f in files])
Example #7
from feather import read_dataframe as read_feather
import numpy as np
import os
import pandas as pd
import pdb
import pickle
from time import time

from zpylib import datatools
from zpylib.learn.gbm import Lgbm
from zpylib import stats
from zpylib import data_path as dp
from zpylib import model_path as mp

# Constants
TRAIN_PATHS = [
    dp('refactored/' + st) for st in os.listdir(dp('refactored'))
    if 'train_' in st
]
TRAIN_PATHS.sort()
TEST_PATH = dp('refactored/test.feather')
SAMPLE_PATH = dp('submit/sample_submission.csv')
SUBMIT_PATH = dp('submit/submission.csv')
MODELS_PATH = mp('lgb_3feather.pkl')

EXTRA_PREDICTORS = ['pydens_' + str(k) for k in range(10)]
# KEEPERS = [
#     'SmartScreen', 'Census_OEMModelIdentifier', 'AvSigVersion', 'Census_FirmwareVersionIdentifier', 'CityIdentifier',
#     'continuous_AVProductStatesIdentifier', 'AVProductStatesIdentifier', 'CountryIdentifier',
#     'Census_ProcessorModelIdentifier', 'EngineVersion', 'AppVersion', 'Census_TotalPhysicalRAM', 'Census_OSVersion',
#     'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Wdft_IsGamer', 'Census_OSInstallTypeName', 'OsBuildLab',
#     'LocaleEnglishNameIdentifier', 'DefaultBrowsersIdentifier', 'IeVerIdentifier', 'GeoNameIdentifier',
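

# Read a single raw column from the training feather file.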
def load_features(col):
    return read_feather(dp('raw/train.feather'), columns=[col])
Example #9
import pandas as pd

from zpylib import data_path as dp

def featherize(infile, outfile):
    print("read_csv-ing " + infile)
    df = pd.read_csv(infile, low_memory=False)
    print("writing " + outfile)
    df.to_feather(outfile)

featherize(dp('raw/train.csv'), dp('raw/train.feather'))
featherize(dp('raw/test.csv'), dp('raw/test.feather'))
Example #10
# # Identify categoricals that also appear important as continuous features
# df = lgb1.importance()
# important_continuous = set(df[df.gain > 1000].feature.tolist())
# catcont = list(set(data.coltypes.categorical).intersection(important_continuous))
# # Of these, leave as categorical those with fewer than four distinct values
# mdf = metadata.build_refactored_metadata()
# mdf = mdf[mdf.is_categorical==1]
# mdf = mdf[mdf['nunique'] > 3]
# mycatcont = list(set(catcont).intersection(set(mdf.colname.tolist())))
# "['" + "', '".join(mycatcont) + "']"
# pd.read_csv(dp('metadata/AVProductStatesIdentifier'))


def filter_categoricals_on_nunique(cols, lb=4):
    mdf = metadata.build_refactored_metadata()
    mdf = mdf[mdf.is_categorical == 1]
    mdf = mdf[mdf['nunique'] >= lb]
    okcols = mdf.colname.tolist()
    return list(set(cols).intersection(set(okcols)))


dtypes = datatools.FeaturesByType()
df = datatools.read_feather(dp('refactored/train_split_0.feather'))
data = datatools.Data(df, select=dtypes.categorical)
lgb = train_one_gbm(data, cat=True)
top_categoricals = lgb.importance().head(10).feature.tolist()
filter_categoricals_on_nunique(top_categoricals)
pdb.set_trace()

pd.read_csv(dp('metadata/SmartScreen'))
Example #11
from feather import read_dataframe as read_feather
import numpy as np
import pdb

import pydens  # Install from GitHub: https://github.com/zkurtz/pydens
import zpylib as zp
from zpylib import data_path as dp

target = 'HasDetections'
models_path = zp.model_path('lgb_3feather.pkl')
top_features = zp.model_loaders.which_top_features(models_path, N=100)
top_features_with_target = top_features + [target]

###############
## Load the data for the top features, for both a train sample and the test sample
train_df = read_feather(dp("refactored/train_split_0.feather"),
                        columns=top_features_with_target).iloc[:500000]
test_df = read_feather(dp("refactored/test_sample.feather"),
                       columns=top_features)
train_data = zp.datatools.Data(train_df.drop(target, axis=1))
test_data = zp.datatools.Data(test_df)
cats = [
    f for f in train_data.X.columns if f in train_data.coltypes.categorical
]

###############
## Fit a density model
classifier = pydens.classifiers.lightgbm.Lgbm(categorical_features=cats,
                                              verbose=True)
num_dens_params = {'loner_min_count': 100, 'binning_params': {'max_bins': 20}}
cade = pydens.cade.Cade(classifier=classifier,
Example #12
from distutils.version import LooseVersion
import pandas as pd
import pdb

from zpylib import datatools
from zpylib import data_path as dp

# Optionally add 'MachineIdentifier' (see the commented-out uniqueness assertion below)
cols = datatools.identify_version_features()

print("Read version features in train.csv")
vftrain = pd.read_csv(dp("raw/train.csv"), usecols=cols + ['HasDetections'])
print("Read version features in test.csv")
vftest = pd.read_csv(dp("raw/test.csv"), usecols=cols)
#assert vftrain.shape[0] + vftest.shape[0] == pd.concat([vftrain.MachineIdentifier, vftest.MachineIdentifier]).nunique()
df = pd.concat([vftrain, vftest], ignore_index=True, sort=True)


def build_version_index(v):
    values = df[v].unique().tolist()
    values.sort(key=LooseVersion)
    return pd.DataFrame({v: values, 'idx': range(len(values))})


vmeta = {v: build_version_index(v) for v in cols}


def order_correlation(v):
    print(v)
    lkp = vmeta[v]
    x = vftrain[[v]]
Example #13
            newcol = 'new_' + col
            series = df[newcol].fillna(metadata.shape[0])
            values = series.astype(
                np.int64
            ).values  # Lightgbm treats this as missing for categorical features
        elif metadata[col].dtype.name in ['int64', 'float64']:
            values = series.values
        else:
            pdb.set_trace()
    else:
        values = series.values
    return values


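# Rebuild a full feather file by re-encoding each expected column with refactor_col.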
def refactor(infile, outfile, expected_columns):
    print("### Loading " + infile)
    df = pd.DataFrame(
        {col: refactor_col(infile, col)
         for col in expected_columns})
    df.to_feather(outfile)


# Constants
ALLCOLS = pd.read_csv(dp('raw/train.csv'), nrows=2).columns.tolist()
ALLCOLS_EXCEPT_RESPONSE = [a for a in ALLCOLS if a != 'HasDetections']
PREDCOLS = train_colnames()

# Main
refactor(dp("raw/train.feather"), dp("refactored/train.feather"), ALLCOLS)
refactor(dp("raw/test.feather"), dp("refactored/test.feather"),
         ALLCOLS_EXCEPT_RESPONSE)