Exemplo n.º 1
0
    # Exclude every feature column that already contains missing values: the
    # RMSE against the complete data cannot be calculated for such columns.
    for el in features:
        if dset['df'][el].isnull().any():
            features_drop.append(el)
    if features_drop:
        logger.info('dropping features {}'.format(features_drop))
        # Pause for two seconds after reporting the dropped columns.
        time.sleep(2)

    features = [el for el in features if el not in features_drop]

    # The builtin float is used as dtype: the np.float alias was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    data = dset['df'][features].values.astype(float)

    if is_normalize_0_1:
        # presumably rescales the values to the [0, 1] range -- confirm
        # against normalize_array, which is defined outside this fragment
        data = normalize_array(data)

    # Result of applying utilmlab.introduce_missing to the retained features.
    X_missing = utilmlab.introduce_missing(data, p_miss, uniform_miss, miss_corr)

    # Save the version with missing values (built once; the original code
    # constructed this identical DataFrame twice).
    df_missing = pd.DataFrame(X_missing, columns=features)
    if islabel:
        df_missing[labels] = df[labels]
    df_missing.to_csv(
        fn_missing_csv, index=False)

    # Save the data before missing values were introduced, with the same
    # columns and the same optional label columns.
    df_data = pd.DataFrame(data, columns=features)
    if islabel:
        df_data[labels] = df[labels]
    df_data.to_csv(
        fn_csv, index=False)
    if fn_json is not None:
        with open(fn_json, "w") as f:
            dset_prop = {
        # Load the input CSV and split it into a feature frame and a label
        # column. Both the input path and the label name are required.
        assert fn_i is not None
        assert label is not None
        logger.info('loading {} lbl:{} sep:{}'.format(
            fn_i, label, sep))
        df = pd.read_csv(fn_i, sep=sep)
        # Every column other than the label is treated as a feature.
        features = list(df.columns)
        assert label in features
        features.remove(label)
        # Require a non-empty table and at least two feature columns.
        assert len(df)
        assert len(features) > 1
        X_ = df[features]
        Y_ = df[label]

    # When p_miss is truthy, replace the features with the output of
    # utilmlab.introduce_missing, re-wrapped in a DataFrame that keeps the
    # original column names.
    if p_miss:
        X_ = pd.DataFrame(
            utilmlab.introduce_missing(X_, p_miss),
            columns=X_.columns)

    # When nsample is truthy, log the pre-truncation shapes together with the
    # value of utilmlab.df_get_num_na (presumably the count of missing
    # entries -- confirm) and its ratio to the total number of cells, then
    # keep only the first nsample rows of the features and the labels.
    if nsample:
        nnan_x = utilmlab.df_get_num_na(X_)
        logger.info('+shape: x:{} y:{} nan: x:{} ({})'.format(
            X_.shape,
            Y_.shape,
            nnan_x,
            nnan_x/float(np.prod(X_.shape))
        ))
        X_ = X_.iloc[:nsample, :]
        Y_ = Y_[:nsample]

    nnan_x = utilmlab.df_get_num_na(X_)
    logger.info('shape: x:{} y:{} #{} nan: x:{} ({:0.3f})'.format(
    # Exclude every feature column that already contains missing values: the
    # RMSE against the complete data cannot be calculated for such columns.
    for el in features:
        if dset['df'][el].isnull().any():
            features_drop.append(el)
    if features_drop:
        logger.info('dropping features {}'.format(features_drop))
        # Pause for two seconds after reporting the dropped columns.
        time.sleep(2)

    features = [el for el in features if el not in features_drop]

    # The builtin float is used as dtype: the np.float alias was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    data = dset['df'][features].values.astype(float)

    if is_normalize_0_1:
        # presumably rescales the values to the [0, 1] range -- confirm
        # against normalize_array, which is defined outside this fragment
        data = normalize_array(data)

    # Result of applying utilmlab.introduce_missing to the retained features.
    X_missing = utilmlab.introduce_missing(data, p_miss)

    # Save the version with missing values (built once; the original code
    # constructed this identical DataFrame twice).
    df_missing = pd.DataFrame(X_missing, columns=features)
    if islabel:
        df_missing[labels] = df[labels]
    df_missing.to_csv(
        fn_missing_csv, index=False)

    # Save the data before missing values were introduced, with the same
    # columns and the same optional label columns.
    df_data = pd.DataFrame(data, columns=features)
    if islabel:
        df_data[labels] = df[labels]
    df_data.to_csv(
        fn_csv, index=False)
    if fn_json is not None:
        with open(fn_json, "w") as f:
Exemplo n.º 4
0
    df = dset['df']
    logger.info(dset['targets'])
    assert len(dset['targets']) >= 1

    # Take an explicit copy of the selected columns: df_dst is assigned into
    # below, and writing to a selection of another frame is the pattern that
    # triggers pandas' SettingWithCopyWarning.
    df_dst = df[features + dset['targets']].copy()

    # When the requested label name is not among the dataset's target columns,
    # store the target data under the requested label name(s) and drop the
    # original target columns.
    if label[0] not in dset['targets']:
        print(dset['targets'], label)
        print(df_dst.columns)
        df_dst[label] = df[dset['targets']]
        print(df.columns)
        df_dst = df_dst.drop(dset['targets'], axis=1)
        for el in dset['targets']:
            assert el not in df_dst.columns

    # Drop the local reference to the source frame; only df_dst is used below.
    del df

    logger.info('{} {} o:{} lbl:{} pmiss:{}'.format(dataset,
                                                    df_dst.values.shape, fn_o,
                                                    label, p_miss))

    if nsample:
        # Keep the first nsample rows; copied so the feature assignment below
        # writes to an independent frame rather than to a slice.
        df_dst = df_dst[:nsample].copy()

    if p_miss:
        # Overwrite the feature columns with the output of
        # utilmlab.introduce_missing; label columns are left untouched.
        df_dst[features] = utilmlab.introduce_missing(df_dst[features], p_miss)
    assert fn_o is not None
    # Write gzip-compressed output when the file name ends in '.gz'.
    compression = 'gzip' if fn_o.endswith('.gz') else None
    logger.info('saving {} {}'.format(fn_o, list(df_dst.columns)))
    df_dst.to_csv(fn_o, index=False, compression=compression, sep=sep)
Exemplo n.º 5
0
        # Load the input CSV and split it into a feature frame and a label
        # column; every column other than the label is treated as a feature.
        df = pd.read_csv(fn_i, sep=sep)
        features = list(df.columns)
        assert label in features
        features.remove(label)
        # Require a non-empty table and at least two feature columns.
        assert len(df)
        assert len(features) > 1
        X_ = df[features]
        Y_ = df[label]
        # Optional second CSV whose 'eid' column is kept as the validation
        # indexes; an empty list is used when no such file is given.
        if fn_i_val_index is not None:
            X_val_indexes_ = pd.read_csv(fn_i_val_index, sep=sep)
            X_val_indexes_ = X_val_indexes_['eid']
        else:
            X_val_indexes_ = []

    # When p_miss is truthy, replace the features with the output of
    # utilmlab.introduce_missing, re-wrapped in a DataFrame that keeps the
    # original column names.
    if p_miss:
        X_ = pd.DataFrame(utilmlab.introduce_missing(X_, p_miss),
                          columns=X_.columns)

    # When nsample is truthy, log the pre-truncation shapes and the value of
    # utilmlab.df_get_num_na (presumably the count of missing entries --
    # confirm), then keep only the first nsample rows of features and labels.
    if nsample:
        nnan_x = utilmlab.df_get_num_na(X_)
        logger.info('+shape: x:{} y:{} nan: x:{} ({})'.format(
            X_.shape, Y_.shape, nnan_x, nnan_x / float(np.prod(X_.shape))))
        X_ = X_.iloc[:nsample, :]
        Y_ = Y_[:nsample]

    # Recompute on the (possibly truncated) data and log the final shapes,
    # the number of distinct label values and the ratio of nnan_x to the
    # total number of cells.
    # NOTE(review): unlike the log call above, this ratio is not wrapped in
    # float(); under Python 2 without true division it may truncate -- confirm.
    nnan_x = utilmlab.df_get_num_na(X_)
    logger.info('shape: x:{} y:{} #{} nan: x:{} ({:0.3f})'.format(
        X_.shape, Y_.shape, len(set(Y_)), nnan_x, nnan_x / np.prod(X_.shape)))

    # Only when nnan_x is non-zero and use_gain is set, replace the features
    # with the output of impute_gain (presumably GAIN-based imputation writing
    # to odir -- defined outside this fragment, confirm).
    if nnan_x and use_gain:
        X_ = impute_gain(X_, odir)