def load_data(input_dir=None):
    """Loads the Ames Housing dataset

    Source::

        De Cock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
        <https://s3.amazonaws.com/mit-dai-ballet/ames/DataDocumentation.txt>
    """
    if input_dir is not None:
        config = load_config()
        tables = config.get('data.tables')

        entities_table_name = config.get('data.entities_table_name')
        entities_config = one_or_raise(
            fy.lwhere(tables, name=entities_table_name))
        X_df = load_table_from_config(input_dir, entities_config)

        targets_table_name = config.get('data.targets_table_name')
        targets_config = one_or_raise(
            fy.lwhere(tables, name=targets_table_name))
        y_df = load_table_from_config(input_dir, targets_config)
    else:
        source = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
        df = pd.read_csv(source, sep='\t')
        X_df = df.drop('SalePrice', axis=1)
        y_df = df['SalePrice']

    return X_df, y_df
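
A minimal usage sketch of this loader (the local path below is hypothetical;
the no-argument call assumes network access to the public S3 bucket):

# Download the full raw dataset from S3 and split off the target column.
X_df, y_df = load_data()

# Or read previously extracted tables from a local project directory;
# table names and paths come from the project's config.
X_df, y_df = load_data(input_dir='path/to/data')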
Example #2
def load_data(
    input_dir=None,
    split='train',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load data for a specific split

    If input_dir is not provided, loads X and y for the given split from the
    default location (S3). If input_dir is provided, loads the
    entities/targets tables from their default table names from the given
    directory, ignoring split.

    For feature development, only the train split should be used.
    """
    config = load_config()
    tables = config.data.tables
    entities_table_name = config.data.entities_table_name
    entities_config = some(where(tables, name=entities_table_name))
    targets_table_name = config.data.targets_table_name
    targets_config = some(where(tables, name=targets_table_name))

    if input_dir is None:
        bucket = config.data.s3_bucket
        split_path = config.data.splits.get(split)
        input_dir = f's3://{bucket}/{split_path}'

    X = load_table_from_config(input_dir, entities_config)
    y = load_table_from_config(input_dir, targets_config)

    return X, y
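
A usage sketch for this split-aware variant, assuming the config's
data.splits mapping defines 'train' and 'test' entries:

# Load the train split from its configured S3 location.
X_train, y_train = load_data(split='train')

# Load tables from a local directory instead; split is ignored here.
X, y = load_data(input_dir='path/to/data')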
Example #3
def load_data(input_dir=None):
    """Load data"""
    if input_dir is not None:
        config = load_config()
        tables = config.get('data.tables')

        entities_table_name = config.get('data.entities_table_name')
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)

        targets_table_name = config.get('data.targets_table_name')
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
    else:
        root = 'https://mit-dai-ballet.s3.amazonaws.com/census'
        X = pd.read_csv(root + '/train/entities.csv.gz')
        y = pd.read_csv(root + '/train/targets.csv.gz')

    return X, y
Example #4
def load_data(split='train', input_dir=None):
    """Load data

    If input dir is not None, then load whatever dataset appears in
    `input_dir`. Otherwise, load the data split indicated by `split`.
    """
    if input_dir is not None:
        config = load_config()
        tables = config.data.tables

        entities_table_name = config.data.entities_table_name
        entities_config = some(where(tables, name=entities_table_name))
        X = load_table_from_config(input_dir, entities_config)

        targets_table_name = config.data.targets_table_name
        targets_config = some(where(tables, name=targets_table_name))
        y = load_table_from_config(input_dir, targets_config)
        return X, y

    raise NotImplementedError
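
In this variant only the explicit-directory branch is implemented; a quick
sketch of both call paths (the path below is hypothetical):

# Supported: load the configured entities/targets tables from a directory.
X, y = load_data(input_dir='path/to/data')

# Not yet supported: loading a named split without input_dir.
# load_data(split='test')  # raises NotImplementedError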
Example #5
def make_train_test_split(output_dir, seed=641137):
    # load and split data
    X, y = load_data()
    inds = X.index.copy()
    inds_tr, inds_te = train_test_split(inds,
                                        train_size=0.67,
                                        test_size=0.33,
                                        random_state=seed)
    X_tr_df = X.loc[inds_tr]
    X_te_df = X.loc[inds_te]
    y_tr_df = y.loc[inds_tr]
    y_te_df = y.loc[inds_te]

    # load config
    config = load_config()
    tables = config.get('data.tables')
    entities_table_name = config.get('data.entities_table_name')
    entities_config = one_or_raise(fy.lwhere(tables, name=entities_table_name))
    entities_path = entities_config['path']
    targets_table_name = config.get('data.targets_table_name')
    targets_config = one_or_raise(fy.lwhere(tables, name=targets_table_name))
    targets_path = targets_config['path']

    # prepare directories
    output_dir = pathlib.Path(output_dir)
    train_dir = output_dir.joinpath('train')
    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir = output_dir.joinpath('test')
    test_dir.mkdir(parents=True, exist_ok=True)

    # save tables
    kwargs = {'header': True}
    X_tr_df.to_csv(train_dir.joinpath(entities_path), **kwargs)
    X_te_df.to_csv(test_dir.joinpath(entities_path), **kwargs)
    y_tr_df.to_csv(train_dir.joinpath(targets_path), **kwargs)
    y_te_df.to_csv(test_dir.joinpath(targets_path), **kwargs)

    return X_tr_df, X_te_df, y_tr_df, y_te_df
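
A sketch of invoking the splitter; the output directory is an assumption,
and the configured table paths are expected to be plain filenames such as
'entities.csv':

X_tr, X_te, y_tr, y_te = make_train_test_split('data/splits')

# The split is reproducible because the seed defaults to a fixed value;
# roughly two thirds of the rows land in the train tables.
print(len(X_tr) / (len(X_tr) + len(X_te)))  # ~0.67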
Example #6
def test_load_config_detect(mock_load_config_in_dir):
    path = pathlib.Path(__file__)
    load_config()
    mock_load_config_in_dir.assert_called_once_with(path)
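
The test relies on a mock_load_config_in_dir fixture that is not shown; a
minimal pytest sketch of one, assuming load_config delegates to a
load_config_in_dir helper (the patched module path is hypothetical):

from unittest.mock import patch

import pytest

@pytest.fixture
def mock_load_config_in_dir():
    # Patch the helper so the test can assert which directory
    # load_config() detected, without touching the filesystem.
    with patch('project.config.load_config_in_dir') as mock:
        yield mock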
Example #7
def load_background() -> pd.DataFrame:
    """Load all background data as a single dataframe"""
    config = load_config()
    bucket = config.data.tables.s3_bucket
    path = f's3://{bucket}/raw/background.csv.gz'
    return pd.read_csv(path, compression='gzip', index_col='challengeID')
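
A short usage sketch; the exact column layout beyond the challengeID index
is dataset-specific and assumed here:

background = load_background()
print(background.index.name)  # 'challengeID'
print(background.shape)       # (n_families, n_background_columns)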