Example #1
def header(data):

    logging.info('HEADER')

    err_count = 0

    # train and tournament csv files should have the same header
    if (data.header['train'] != data.header['tournament']).any():
        err_count += 1
        logging.warning('train and tournament csv files have different headers')

    # columns should be in the correct order. We are especially concerned with
    # the order of the features which should be feature1, feature2, ... and
    # not feature1, feature10, feature11, ...
    header = ['id', 'era', 'data_type']
    header += ['feature' + str(i) for i in range(1, 51)]
    header += ['target']
    for i in range(len(header)):
        err_count += _assert('header column', data.header['train'][i], '==',
                             header[i])

    # should have the correct number of columns
    actual = len(data.header['train'])
    desired = len(header)
    err_count += _assert('number of columns in csv file', actual, '==',
                         desired)

    return err_count
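
The snippets on this page assume module-level `import logging` and `import numpy as np`, plus helpers such as `_assert`, `interval`, `array_interval`, and `upper_triangle` that are defined elsewhere in the same module and not shown here. Judging only by how it is called above, `_assert` appears to compare an actual value against a desired one, log a warning on mismatch, and return the error count (0 or 1). A minimal sketch under that assumption; the real implementation may differ:

import logging
import operator

def _assert(name, actual, op, desired):
    # Hypothetical reconstruction of the helper used above: compare
    # `actual` to `desired` with the operator given as a string and
    # report a failure through logging. Returns 0 on success, 1 on error.
    ops = {'==': operator.eq, '!=': operator.ne, '<': operator.lt,
           '<=': operator.le, '>': operator.gt, '>=': operator.ge}
    if ops[op](actual, desired):
        return 0
    logging.warning('%s: got %r, expected %s %r', name, actual, op, desired)
    return 1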
Example #2
def ids(data):

    logging.info('IDS')

    err_count = 0

    # duplicate ids
    num_duplicate = data.ID.size - np.unique(data.ID).size
    err_count += _assert('duplicate ids', num_duplicate, '==', 0)

    return err_count
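
The duplicate check relies on `np.unique` dropping repeated values, so the gap between the raw size and the unique size is the number of extra copies. A toy illustration with a made-up id array:

import numpy as np

ids = np.array(['a1', 'b2', 'c3', 'b2'])        # 'b2' appears twice
num_duplicate = ids.size - np.unique(ids).size
print(num_duplicate)                            # prints 1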
Example #3
def eras(data):

    logging.info('ERAS')

    err_count = 0

    # number of eras
    target = {'train': 85, 'validation': 12, 'test': 1, 'live': 1}
    for region in target:
        n = np.unique(data.era[data.region == region]).size
        msg = 'number of eras in %s' % region
        err_count += _assert(msg, n, '==', target[region])

    return err_count
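
Counting eras per region combines a boolean mask over `data.region` with `np.unique`. The same idea on made-up arrays:

import numpy as np

era = np.array(['era1', 'era1', 'era2', 'eraX', 'eraX'])
region = np.array(['train', 'train', 'train', 'test', 'live'])

n = np.unique(era[region == 'train']).size
print(n)    # prints 2 ('era1' and 'era2')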
Example #4
def labels(data):

    logging.info('LABELS')

    err_count = 0

    # labels should only contain 0 and 1
    idx = data.nonmissing_label_index()
    y = data.y[idx]
    idx = np.logical_or(y == 0, y == 1)
    err_count += _assert("number of non 0, 1 labels", idx.size - idx.sum(),
                         '==', 0)

    # test and live labels should be NaN
    regions = ['test', 'live']
    for region in regions:
        idx = data.region == region
        y = data.y[idx]
        if not np.isnan(y).all():
            err_count += 1
            logging.warn("Some %s labels are not NaN" % region)

    # mean of labels and number of labels
    y_mean = []
    for era, index in data.era_iter():

        y = data.y[index]

        # labels are missing in eraX
        if era != 'eraX':
            msg = 'mean of labels in %s' % era.ljust(6)
            ym = y.mean()
            err_count += interval(msg, ym, [0.499, 0.501])
            y_mean.append(ym)

        msg = 'num  of labels in %s' % era.ljust(6)
        if era == 'eraX':
            limit = [270000, 280000]
        else:
            limit = [5920, 6800]
        err_count += interval(msg, y.size, limit)

    # label bias
    msg = 'fraction of eras with label mean less than half'
    y_mean = np.array(y_mean)
    err_count += interval(msg, (y_mean < 0.5).mean(), [0.4, 0.6])

    return err_count
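
The `interval` helper is also not shown. From the calls above it appears to take a description, a scalar value, and a `[low, high]` pair, and to flag values that fall outside the interval. A minimal sketch under that assumption:

import logging

def interval(name, value, limit):
    # Hypothetical version of the helper used above: return 0 if `value`
    # lies inside the inclusive [low, high] interval, otherwise log a
    # warning and return 1.
    low, high = limit
    if low <= value <= high:
        return 0
    logging.warning('%s: %r outside [%r, %r]', name, value, low, high)
    return 1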
Example #5
def features(data):

    logging.info('FEATURES')

    err_count = 0

    # nonfinite feature values
    num = (~np.isfinite(data.x)).sum()
    err_count += _assert('nonfinite feature values', num, '==', 0)

    # abs correlation of features
    corr = np.corrcoef(data.x.T)
    corr = upper_triangle(corr)
    corr = np.abs(corr)
    err_count += interval('mean abs corr of features', corr.mean(),
                          [0.18, 0.22])
    err_count += interval('max  abs corr of features', corr.max(),
                          [0.72, 0.76])

    # distribution of each feature in each era
    for era, feature_num, x in data.era_feature_iter():

        msg = 'range of feature %2d in %s' % (feature_num, era.ljust(6))
        err_count += array_interval(msg, x, [0, 1])

        msg = 'mean  of feature %2d in %s' % (feature_num, era.ljust(6))
        err_count += interval(msg, x.mean(), [0.45, 0.551])

        msg = 'std   of feature %2d in %s' % (feature_num, era.ljust(6))
        err_count += interval(msg, x.std(), [0.09, 0.15])

        msg = 'skewn of feature %2d in %s' % (feature_num, era.ljust(6))
        skew = ((x - x.mean())**3).mean() / x.std()**3
        err_count += interval(msg, skew, [-0.44, 0.44])

        msg = 'kurto of feature %2d in %s' % (feature_num, era.ljust(6))
        kurt = ((x - x.mean())**4).mean() / x.std()**4
        err_count += interval(msg, kurt, [2.45, 3.58])

    return err_count
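
Two more helpers appear only in this example. `upper_triangle` presumably pulls the off-diagonal upper-triangle entries out of the square correlation matrix so that each feature pair is counted once, and `array_interval` presumably checks that every element of an array lies inside the given range. Hypothetical sketches consistent with that usage:

import logging
import numpy as np

def upper_triangle(a):
    # Hypothetical helper: strictly upper-triangular entries of a square
    # matrix, returned as a flat 1d array.
    idx = np.triu_indices_from(a, k=1)
    return a[idx]

def array_interval(name, x, limit):
    # Hypothetical helper: warn if any element of `x` falls outside the
    # inclusive [low, high] interval; return 0 or 1 as an error count.
    low, high = limit
    if (x >= low).all() and (x <= high).all():
        return 0
    logging.warning('%s: values outside [%r, %r]', name, low, high)
    return 1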