def header(data):
    """Check the column headers of the train and tournament csv files.

    Three checks are made:

    * the train and tournament headers are identical,
    * the train columns appear in the expected order
      (id, era, data_type, feature1..feature50, target),
    * the train header has the expected number of columns.

    `data.header` is indexed with the keys 'train' and 'tournament'; each
    value is treated as a sequence of column names.

    Returns the accumulated error count: 1 for a train/tournament mismatch
    plus whatever `_assert` returns for each check (presumably 0 on success
    and 1 on failure -- confirm against `_assert`).
    """
    logging.info('HEADER')
    err_count = 0

    train_header = list(data.header['train'])
    tournament_header = list(data.header['tournament'])

    # train and tournament csv files should have the same header. Comparing
    # plain lists means headers of different length are reported as different
    # instead of tripping up an elementwise comparison.
    if train_header != tournament_header:
        err_count += 1
        logging.warning(
            'train and tournament csv files have different headers')

    # columns should be in the correct order. We are especially concerned with
    # the order of the features which should be feature1, feature2, ... and
    # not feature1, feature10, feature11, ...
    expected = ['id', 'era', 'data_type']
    expected += ['feature' + str(i) for i in range(1, 51)]
    expected += ['target']

    # zip stops at the shorter sequence, so a truncated train header no longer
    # raises IndexError here; the missing (or extra) columns are reported by
    # the column-count check below.
    for actual_name, expected_name in zip(train_header, expected):
        err_count += _assert('header column', actual_name, '==',
                             expected_name)

    # should have the correct number of columns
    actual = len(train_header)
    desired = len(expected)
    err_count += _assert('number of column in csv file', actual, '==',
                         desired)

    return err_count
def ids(data):
    """Check that no id appears more than once in `data.ID`.

    Returns the error count contributed by the single `_assert` call.
    """
    logging.info('IDS')
    err_count = 0

    # Any entry that np.unique collapses away is a duplicate id.
    total = data.ID.size
    distinct = np.unique(data.ID).size
    err_count += _assert('duplicate ids', total - distinct, '==', 0)

    return err_count
def eras(data):
    """Check the number of distinct eras found in each region.

    The expected counts are hard-coded per region name. Returns the sum of
    the values returned by `_assert` for each region.
    """
    logging.info('ERAS')
    err_count = 0

    # expected number of unique eras per region
    expected = {'train': 85, 'validation': 12, 'test': 1, 'live': 1}

    for region, n_expected in expected.items():
        in_region = data.region == region
        n_found = np.unique(data.era[in_region]).size
        err_count += _assert('number of eras in %s' % region, n_found, '==',
                             n_expected)

    return err_count
def labels(data):
    """Check the labels stored in `data.y`.

    Checks performed:

    * every non-missing label (as selected by
      `data.nonmissing_label_index()`) is 0 or 1,
    * all labels in the 'test' and 'live' regions are NaN,
    * for every era yielded by `data.era_iter()`: the label mean lies in
      [0.499, 0.501] (skipped for 'eraX') and the number of labels lies in
      a hard-coded interval (a separate interval is used for 'eraX'),
    * the fraction of eras whose label mean is below 0.5 lies in [0.4, 0.6].

    Returns the accumulated error count: 1 for each region with non-NaN
    labels plus whatever `_assert` and `interval` return for each check
    (presumably 0 on success, 1 on failure -- confirm against those helpers).
    """
    logging.info('LABELS')
    err_count = 0

    # labels should only contain 0 and 1
    labelled = data.nonmissing_label_index()
    y = data.y[labelled]
    is_binary = np.logical_or(y == 0, y == 1)
    err_count += _assert("number of non 0, 1 labels",
                         is_binary.size - is_binary.sum(), '==', 0)

    # test and live labels should be NaN
    for region in ['test', 'live']:
        y = data.y[data.region == region]
        if not np.isnan(y).all():
            err_count += 1
            # logging.warn is a deprecated alias; logging.warning emits the
            # same message.
            logging.warning("Some %s labels are not NaN", region)

    # mean of labels and number of labels
    y_mean = []
    for era, index in data.era_iter():
        y = data.y[index]
        # labels are missing in eraX, so its mean is not checked
        if era != 'eraX':
            msg = 'mean of labels in %s' % era.ljust(6)
            ym = y.mean()
            err_count += interval(msg, ym, [0.499, 0.501])
            y_mean.append(ym)
        msg = 'num of labels in %s' % era.ljust(6)
        if era == 'eraX':
            limit = [270000, 280000]
        else:
            limit = [5920, 6800]
        err_count += interval(msg, y.size, limit)

    # label bias
    # NOTE(review): if only 'eraX' is present, y_mean is empty and the mean
    # below is NaN; how `interval` treats NaN is not visible here.
    msg = 'fraction of eras with label mean less than half'
    y_mean = np.array(y_mean)
    err_count += interval(msg, (y_mean < 0.5).mean(), [0.4, 0.6])

    return err_count
def features(data):
    """Check the feature matrix `data.x`.

    Checks performed:

    * no non-finite feature values,
    * mean and max absolute pairwise feature correlation lie in hard-coded
      intervals,
    * for every (era, feature) pair yielded by `data.era_feature_iter()`:
      value range, mean, standard deviation, skewness and kurtosis lie in
      hard-coded intervals.

    Returns the sum of the values returned by `_assert`, `interval` and
    `array_interval` for each check.
    """
    logging.info('FEATURES')
    err_count = 0

    # nonfinite feature values
    n_nonfinite = np.logical_not(np.isfinite(data.x)).sum()
    err_count += _assert('nonfinite feature values', n_nonfinite, '==', 0)

    # abs correlation of features
    abs_corr = np.abs(upper_triangle(np.corrcoef(data.x.T)))
    err_count += interval('mean abs corr of features', abs_corr.mean(),
                          [0.18, 0.22])
    err_count += interval('max abs corr of features', abs_corr.max(),
                          [0.72, 0.76])

    # distribution of each feature in each era
    for era, feature_num, x in data.era_feature_iter():
        # shared formatting arguments for every message of this pair
        where = (feature_num, era.ljust(6))

        err_count += array_interval('range of feature %2d in %s' % where,
                                    x, [0, 1])

        mu = x.mean()
        err_count += interval('mean of feature %2d in %s' % where,
                              mu, [0.45, 0.551])

        sigma = x.std()
        err_count += interval('std of feature %2d in %s' % where,
                              sigma, [0.09, 0.15])

        # standardized third and fourth central moments
        centered = x - mu

        skew = (centered**3).mean() / sigma**3
        err_count += interval('skewn of feature %2d in %s' % where,
                              skew, [-0.44, 0.44])

        kurt = (centered**4).mean() / sigma**4
        err_count += interval('kurto of feature %2d in %s' % where,
                              kurt, [2.45, 3.58])

    return err_count