Пример #1
0
def load_data(input_dir=None):
    """Load the Ames Housing dataset as an (entities, targets) pair.

    With ``input_dir`` given, the entities and targets tables named in the
    project configuration are read from that directory. Without it, the raw
    tab-separated dataset is fetched from its public S3 location and the
    ``SalePrice`` column is split off as the target.

    Args:
        input_dir: directory containing the tables described by the
            configuration, or ``None`` to download the raw dataset.

    Returns:
        tuple ``(X_df, y_df)`` of entities and targets. In the download case
        ``X_df`` is the frame without ``SalePrice`` and ``y_df`` is the
        ``SalePrice`` column.

    Source::

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as an
        End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
        <https://s3.amazonaws.com/mit-dai-ballet/ames/DataDocumentation.txt>
    """
    if input_dir is None:
        # No local copy requested: pull the raw file and separate the target.
        url = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
        raw = pd.read_csv(url, sep='\t')
        return raw.drop('SalePrice', axis=1), raw['SalePrice']

    table_configs = conf.get('tables')

    # Each table is located by looking up its configured name among the
    # table configs; one_or_raise is expected to reject ambiguous or missing
    # matches (NOTE(review): confirm against its definition).
    entities_name = conf.get('data', 'entities_table_name')
    entities_cfg = one_or_raise(lwhere(table_configs, name=entities_name))
    X_df = load_table_from_config(input_dir, entities_cfg)

    targets_name = conf.get('data', 'targets_table_name')
    targets_cfg = one_or_raise(lwhere(table_configs, name=targets_name))
    y_df = load_table_from_config(input_dir, targets_cfg)

    return X_df, y_df
Пример #2
0
def push_changes(repo, user, feature, dry_run=False):
    """Push the branch for ``user``'s ``feature`` to the ``origin`` remote.

    The branch name is derived by ``_make_branch_name(user, feature)`` and is
    pushed to a remote branch of the same name.

    Args:
        repo: repository object providing ``remote('origin')``.
        user: user identifier passed to ``_make_branch_name``.
        feature: feature identifier passed to ``_make_branch_name``.
        dry_run: if true, only print the push that would be executed.

    Raises:
        BalletError: the push result indicates that the push did not succeed.
    """
    branch = _make_branch_name(user, feature)
    # The remote is resolved even on a dry run, so a missing 'origin' is
    # still surfaced.
    remote = repo.remote('origin')
    refspec = f'{branch}:{branch}'

    if dry_run:
        print('[dry run] would execute \'origin.push({refspec!r})\''.format(
            refspec=refspec))
        return

    # A single refspec is pushed, so exactly one push-info entry is expected.
    push_info = one_or_raise(remote.push(refspec))
    if did_git_push_succeed(push_info):
        return
    raise BalletError('Git push failed, '
                      'maybe you need to delete a branch on remote?')
def make_train_test_split(output_dir, seed=641137):
    """Split the dataset into train and test parts and save them as CSV.

    The data returned by ``load_data()`` is split 67%/33% on its index. The
    entities and targets of each part are written under
    ``<output_dir>/train`` and ``<output_dir>/test``, using the file paths
    that the project configuration declares for the entities and targets
    tables.

    Args:
        output_dir: directory under which the ``train`` and ``test``
            directories are created. It is created too if it does not exist.
        seed: random state passed to ``train_test_split`` so that the split
            is reproducible.

    Returns:
        tuple ``(X_tr_df, X_te_df, y_tr_df, y_te_df)`` of the train/test
        entities and train/test targets.
    """
    # load and split data
    X, y = load_data()
    inds = X.index.copy()
    inds_tr, inds_te = train_test_split(inds,
                                        train_size=0.67,
                                        test_size=0.33,
                                        random_state=seed)
    X_tr_df = X.loc[inds_tr]
    X_te_df = X.loc[inds_te]
    y_tr_df = y.loc[inds_tr]
    y_te_df = y.loc[inds_te]

    # load config
    config = load_config()
    tables = config.get('data.tables')
    entities_table_name = config.get('data.entities_table_name')
    entities_config = one_or_raise(fy.lwhere(tables, name=entities_table_name))
    entities_path = entities_config['path']
    targets_table_name = config.get('data.targets_table_name')
    targets_config = one_or_raise(fy.lwhere(tables, name=targets_table_name))
    targets_path = targets_config['path']

    # prepare directories; parents=True so that a not-yet-existing output_dir
    # is created instead of raising FileNotFoundError
    output_dir = pathlib.Path(output_dir)
    train_dir = output_dir.joinpath('train')
    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir = output_dir.joinpath('test')
    test_dir.mkdir(parents=True, exist_ok=True)

    # save tables; header=True keeps a column header in every output file
    kwargs = {'header': True}
    X_tr_df.to_csv(train_dir.joinpath(entities_path), **kwargs)
    X_te_df.to_csv(test_dir.joinpath(entities_path), **kwargs)
    y_tr_df.to_csv(train_dir.joinpath(targets_path), **kwargs)
    y_te_df.to_csv(test_dir.joinpath(targets_path), **kwargs)

    return X_tr_df, X_te_df, y_tr_df, y_te_df
Пример #4
0
def get_diff_endpoints_from_commit_range(
        repo: git.Repo,
        commit_range: str) -> Tuple[git.Diffable, git.Diffable]:
    """Resolve a commit range string into the two endpoints of a diff

    The returned pair can be diffed directly::

        a, b = get_diff_endpoints_from_commit_range(repo, commit_range)
        a.diff(b)

    See ``git diff --help`` for how diffs are specified and ``git help
    revisions`` for how revisions are specified.

    Args:
        repo: Repo object initialized with project root
        commit_range: commit range in the syntax understood by ``git
            diff``. Only the two forms ``a..b`` and ``a...b`` are supported.
            With the three-dot form, the diff starts at the merge-base of a
            and b rather than at a itself.

    Returns:
        starting commit, ending commit (inclusive)

    Raises:
        ValueError: commit_range is empty or ill-formed

    See also:

        <https://stackoverflow.com/q/7251477>
    """
    if not commit_range:
        raise ValueError('commit_range cannot be empty')

    # The match is expected to expose the groups 'a', 'b' and 'thirddot'.
    parsed = re_find(COMMIT_RANGE_REGEX, commit_range)
    if not parsed:
        raise ValueError(
            "Expected diff str of the form 'a..b' or 'a...b' "
            f'(got {commit_range})')

    rev_start, rev_end = parsed['a'], parsed['b']
    start = repo.rev_parse(rev_start)
    end = repo.rev_parse(rev_end)

    # Three-dot form: start the diff from the common ancestor of both ends.
    if parsed['thirddot']:
        start = one_or_raise(repo.merge_base(start, end))

    return start, end