def predictions(data):
    """Sanity-check model predictions on every region of *data*.

    Fits a logistic regression on the 'train' region, then verifies that
    logloss and era-wise consistency fall inside expected intervals for the
    train and validation regions, and that test/live predictions stay within
    (slightly widened) bounds of the train predictions.

    Returns the number of failed checks.
    """
    logging.info('PREDICTIONS')
    nerr = 0

    # fit logistic regression model on train data
    train_mask = data.region == 'train'
    x_tr = data.x[train_mask]
    y_tr = data.y[train_mask]
    era_tr = data.era[train_mask]
    clf = LogisticRegression()
    clf.fit(x_tr, y_tr)

    # predict using train data (probability of the positive class)
    yhat_train = clf.predict_proba(x_tr)[:, 1]

    # check train logloss and consistency
    nerr += interval('train logloss',
                     log_loss(y_tr, yhat_train),
                     [0.691, 0.693])
    era_losses = logloss_by_era(era_tr, y_tr, yhat_train)
    # consistency = fraction of eras beating a coin flip (logloss < ln 2)
    nerr += interval('train consistency',
                     (era_losses < np.log(2)).mean(),
                     [0.57, 0.84])

    # predict using validation data
    yvalid, yhat = calc_yhat('validation', clf, data)

    # check validation logloss and consistency
    nerr += interval('validation logloss',
                     log_loss(yvalid, yhat),
                     [0.691, 0.693])
    valid_mask = data.region == 'validation'
    era_losses = logloss_by_era(data.era[valid_mask], yvalid, yhat)
    nerr += interval('validation consistency',
                     (era_losses < np.log(2)).mean(),
                     [0.5, 0.84])

    # check test and live predictions: they should land near the range of
    # the train predictions (1% slack on each side)
    bounds = [0.99 * yhat_train.min(), 1.01 * yhat_train.max()]
    for region in ('test', 'live'):
        y, yhat = calc_yhat(region, clf, data)
        nerr += array_interval('predictions in %s region' % region,
                               yhat, bounds)

    return nerr
def features(data):
    """Sanity-check the feature matrix of *data*.

    Verifies that all feature values are finite, that pairwise feature
    correlations fall in the expected band, and that each feature's
    distribution (range, mean, std, skewness, kurtosis) within each era
    stays inside expected intervals.

    Returns the number of failed checks.
    """
    logging.info('FEATURES')
    nerr = 0

    # nonfinite feature values (NaN/inf) are never allowed
    n_bad = (~np.isfinite(data.x)).sum()
    nerr += _assert('nonfinite feature values', n_bad, '==', 0)

    # abs correlation of features (upper triangle only, to skip the
    # diagonal and duplicate pairs)
    abs_corr = np.abs(upper_triangle(np.corrcoef(data.x.T)))
    nerr += interval('mean abs corr of features', abs_corr.mean(),
                     [0.18, 0.22])
    nerr += interval('max abs corr of features', abs_corr.max(),
                     [0.72, 0.76])

    # distribution of each feature in each era
    for era, feature_num, x in data.era_feature_iter():
        label = era.ljust(6)
        mu = x.mean()
        sd = x.std()
        nerr += array_interval(
            'range of feature %2d in %s' % (feature_num, label),
            x, [0, 1])
        nerr += interval(
            'mean of feature %2d in %s' % (feature_num, label),
            mu, [0.45, 0.551])
        nerr += interval(
            'std of feature %2d in %s' % (feature_num, label),
            sd, [0.09, 0.15])
        # third standardized moment
        nerr += interval(
            'skewn of feature %2d in %s' % (feature_num, label),
            ((x - mu)**3).mean() / sd**3, [-0.44, 0.44])
        # fourth standardized moment (normal distribution gives 3)
        nerr += interval(
            'kurto of feature %2d in %s' % (feature_num, label),
            ((x - mu)**4).mean() / sd**4, [2.45, 3.58])

    return nerr