def load_data(input_dir=None):
    """Load the Ames Housing dataset as a ``(X_df, y_df)`` pair.

    When ``input_dir`` is provided, the entities and targets tables are looked
    up by name in ``conf`` and each is read via ``load_table_from_config``.
    When it is ``None``, the full dataset is read from its public URL as a
    tab-separated file and the ``SalePrice`` column is split off as the target.

    Source::

        Decock, Dean. "Ames, Iowa: Alternative to the Boston Housing Data as
        an End of Semester Regression Project."
        <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf>
        <https://s3.amazonaws.com/mit-dai-ballet/ames/DataDocumentation.txt>

    Args:
        input_dir: passed through to ``load_table_from_config`` together with
            each table's config; ``None`` selects the remote source instead.

    Returns:
        tuple ``(X_df, y_df)`` of the entities data and the target data
    """
    if input_dir is None:
        # No local data requested: read the raw file and separate the target.
        source = 'https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt'
        raw = pd.read_csv(source, sep='\t')
        return raw.drop('SalePrice', axis=1), raw['SalePrice']

    tables = conf.get('tables')

    def _load_named_table(name_key):
        # Resolve the table name from the config, find its unique table
        # entry, and load it from input_dir.
        table_name = conf.get('data', name_key)
        table_config = one_or_raise(lwhere(tables, name=table_name))
        return load_table_from_config(input_dir, table_config)

    X_df = _load_named_table('entities_table_name')
    y_df = _load_named_table('targets_table_name')
    return X_df, y_df
def push_changes(repo, user, feature, dry_run=False):
    """Push the branch for ``user`` and ``feature`` to the ``origin`` remote.

    The branch name is derived with ``_make_branch_name`` and pushed using a
    ``<branch>:<branch>`` refspec.

    Args:
        repo: repository object providing ``remote('origin')``
        user: passed to ``_make_branch_name``
        feature: passed to ``_make_branch_name``
        dry_run: if true, only print the push that would be executed

    Raises:
        BalletError: the push was attempted but ``did_git_push_succeed``
            reported failure
    """
    branch = _make_branch_name(user, feature)
    remote = repo.remote('origin')
    refspec = '{branch}:{branch}'.format(branch=branch)

    if dry_run:
        print('[dry run] would execute \'origin.push({refspec!r})\''.format(
            refspec=refspec))
        return

    # Exactly one push result is expected for the single refspec.
    push_info = one_or_raise(remote.push(refspec))
    if not did_git_push_succeed(push_info):
        raise BalletError('Git push failed, '
                          'maybe you need to delete a branch on remote?')
def make_train_test_split(output_dir, seed=641137):
    """Split the dataset into train/test parts and write them as CSV files.

    The data returned by ``load_data()`` is split by index (67% train, 33%
    test). The entities and targets table paths are read from the project
    config, and the four resulting tables are written under
    ``<output_dir>/train`` and ``<output_dir>/test``.

    Args:
        output_dir: directory under which the ``train`` and ``test``
            subdirectories are created; missing parent directories are
            created as well
        seed: random state passed to ``train_test_split``

    Returns:
        tuple ``(X_tr_df, X_te_df, y_tr_df, y_te_df)``
    """
    # load and split data
    X, y = load_data()
    inds = X.index.copy()
    inds_tr, inds_te = train_test_split(
        inds, train_size=0.67, test_size=0.33, random_state=seed)
    X_tr_df = X.loc[inds_tr]
    X_te_df = X.loc[inds_te]
    y_tr_df = y.loc[inds_tr]
    y_te_df = y.loc[inds_te]

    # load config
    config = load_config()
    tables = config.get('data.tables')
    entities_table_name = config.get('data.entities_table_name')
    entities_config = one_or_raise(
        fy.lwhere(tables, name=entities_table_name))
    entities_path = entities_config['path']
    targets_table_name = config.get('data.targets_table_name')
    targets_config = one_or_raise(
        fy.lwhere(tables, name=targets_table_name))
    targets_path = targets_config['path']

    # prepare directories; parents=True so that a not-yet-existing output_dir
    # does not make mkdir fail with FileNotFoundError
    output_dir = pathlib.Path(output_dir)
    train_dir = output_dir.joinpath('train')
    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir = output_dir.joinpath('test')
    test_dir.mkdir(parents=True, exist_ok=True)

    # save tables
    kwargs = {'header': True}
    X_tr_df.to_csv(train_dir.joinpath(entities_path), **kwargs)
    X_te_df.to_csv(test_dir.joinpath(entities_path), **kwargs)
    y_tr_df.to_csv(train_dir.joinpath(targets_path), **kwargs)
    y_te_df.to_csv(test_dir.joinpath(targets_path), **kwargs)

    return X_tr_df, X_te_df, y_tr_df, y_te_df
def get_diff_endpoints_from_commit_range(
    repo: git.Repo, commit_range: str
) -> Tuple[git.Diffable, git.Diffable]:
    """Resolve a commit range string into the two endpoints of a diff.

    The returned pair can be diffed directly::

        a, b = get_diff_endpoints_from_commit_range(repo, commit_range)
        a.diff(b)

    See ``git diff --help`` for how diffs are specified and
    ``git help revisions`` for how revisions are specified.

    Args:
        repo: Repo object initialized with project root
        commit_range: commit range as ``git diff`` would interpret it. Only
            the forms ``a..b`` and ``a...b`` are accepted. For ``a...b``,
            the merge-base of a and b is used as the starting point of the
            diff.

    Returns:
        starting commit, ending commit (inclusive)

    Raises:
        ValueError: commit_range is empty or ill-formed

    See also:
        <https://stackoverflow.com/q/7251477>
    """
    if not commit_range:
        raise ValueError('commit_range cannot be empty')

    match = re_find(COMMIT_RANGE_REGEX, commit_range)
    if not match:
        raise ValueError(
            f'Expected diff str of the form \'a..b\' or \'a...b\' '
            f'(got {commit_range})')

    start_rev = match['a']
    end_rev = match['b']
    start = repo.rev_parse(start_rev)
    end = repo.rev_parse(end_rev)

    # Three-dot form: diff from the merge-base of the two commits instead of
    # from the first commit itself.
    if match['thirddot']:
        start = one_or_raise(repo.merge_base(start, end))

    return start, end