def test_train_full_pipeline(
    tmpdir: LocalPath,
    dataset_path: str,
    categorical_features: List[str],
    numerical_features: List[str],
    target_col: str,
):
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    # join() with no arguments resolves to tmpdir itself, so the report is
    # written directly into the temporary directory.
    expected_report_path = tmpdir.join()
    params = TrainingPipelineParams(
        input_data_path=dataset_path,
        output_model_path=expected_output_model_path,
        metric_path=expected_metric_path,
        report_path=expected_report_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=42),
        feature_params=FeatureParams(
            numerical_features=numerical_features,
            categorical_features=categorical_features,
            target_col=target_col,
        ),
        train_params=TrainingParams(model_type="KNeighborsClassifier"),
    )
    real_model_path, metrics = train_pipeline(params)
    assert metrics["auc"] > 0.5
def test_train_e2e(
    tmpdir: LocalPath,
    fake_dataset: str,
    categorical_features: List[str],
    numerical_features: List[str],
    target_col: str,
    features_to_drop: List[str],
    config_test,
):
    categorical_features = list(set(categorical_features) - set(numerical_features))
    features_to_drop = list(set(features_to_drop) - set(numerical_features))
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_transformer_path = tmpdir.join("transformer.pkl")
    for model_type in config_test.model_types:
        params = TrainingPipelineParams(
            input_data_path=fake_dataset,
            input_data_url="",
            output_model_path=expected_output_model_path,
            metric_path=expected_metric_path,
            transformer_path=expected_transformer_path,
            splitting_params=SplittingParams(
                val_size=config_test.splitting_val_size,
                random_state=config_test.splitting_random_state,
            ),
            feature_params=FeatureParams(
                numerical_features=numerical_features,
                categorical_features=categorical_features,
                target_col=target_col,
                features_to_drop=features_to_drop,
            ),
            train_params=TrainingParams(model_type=model_type),
        )
        real_model_path, metrics = train_pipeline(params)
        assert metrics["accuracy"] >= config_test.min_accuracy
        assert os.path.exists(real_model_path)
        assert os.path.exists(params.metric_path)
def params(tmpdir: LocalPath, numerical_features_yes: List[str], target_col: str):
    expected_train_data_path = tmpdir.join("train.csv")
    expected_model_path = tmpdir.join("models.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_transformer_path = tmpdir.join("transformer.pkl")
    expected_source_data_path = tmpdir.join("source.csv")
    expected_result_data_path = tmpdir.join("result.csv")
    params = Params(
        report_path="",
        train_data_path=expected_train_data_path,
        model_path=expected_model_path,
        features_transformer_path=expected_transformer_path,
        metric_path=expected_metric_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=239),
        train_params=TrainingParams(model_type="RandomForestClassifier"),
        feature_params=FeatureParams(
            categorical_features=None,
            numerical_features=numerical_features_yes,
            features_to_drop=None,
            target_col=target_col,
        ),
        inference_params=InferenceParams(
            source_data_path=expected_source_data_path,
            result_data_path=expected_result_data_path,
        ),
    )
    return params
def test_train_e2e(
    tmpdir: LocalPath,
    dataset_path: str,
    categorical_features: List[str],
    numerical_features: List[str],
    target_col: str,
    features_to_drop: List[str],
):
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_pretrained_model_path = expected_output_model_path
    expected_predictions_path = tmpdir.join("data/predicted/predictions.csv")
    params = TrainingPipelineParams(
        input_data_path=dataset_path,
        output_model_path=expected_output_model_path,
        metric_path=expected_metric_path,
        pretrained_model_path=expected_pretrained_model_path,
        predictions_path=expected_predictions_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=1234),
        feature_params=FeatureParams(
            numerical_features=numerical_features,
            categorical_features=categorical_features,
            target_col=target_col,
            features_to_drop=features_to_drop,
            use_log_trick=False,
        ),
        train_params=TrainingParams(model_type="LogisticRegression"),
    )
    real_model_path, metrics = train_pipeline(params, LogisticRegression())
    assert metrics["roc_auc"] > 0
    assert os.path.exists(real_model_path)
    assert os.path.exists(params.metric_path)
def params(
    tmpdir: LocalPath,
    dataset_path: str,
    categorical_features: List[str],
    numerical_features: List[str],
    target_col: str,
):
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    params = PipelineParams(
        train_data_path=dataset_path,
        data_for_pred_path=tmpdir.join('fake_data.csv'),
        predictions_path=tmpdir.join('predictions.csv'),
        transformer_path=tmpdir.join('transformer.pkl'),
        model_path=expected_output_model_path,
        metric_path=expected_metric_path,
        split_params=SplitParams(),
        features_params=FeatureParams(
            numerical=numerical_features,
            categorical=categorical_features,
            target=target_col,
        ),
        train_params=TrainParams(model_type="LogisticRegression", C=1, n_jobs=-1, penalty='l2'),
    )
    return params
def params(
    dataset_path: str,
    tmpdir: LocalPath,
    categorical_features_no: Optional[str],
    numerical_features_yes: List[str],
    target_col: str,
    features_to_drop_no: Optional[str],
):
    expected_output_model_path = tmpdir.join("models.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_features_transformer_path = tmpdir.join("features_transformer.pkl")
    params = Params(
        report_path="",
        train_data_path=dataset_path,
        model_path=expected_output_model_path,
        features_transformer_path=expected_features_transformer_path,
        metric_path=expected_metric_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=239),
        train_params=TrainingParams(model_type="RandomForestClassifier"),
        feature_params=FeatureParams(
            numerical_features=numerical_features_yes,
            categorical_features=categorical_features_no,
            target_col=target_col,
            features_to_drop=features_to_drop_no,
        ),
        inference_params=InferenceParams(source_data_path="", result_data_path=""),
    )
    return params
class ScriptData(object):
    def __init__(self, tmpdir):
        self.tmpdir = tmpdir
        self.tmpdata = None
        self.pristine_data = LocalPath(os.path.dirname(__file__)).join('data')
        self.installed_packages = None

    def copy_data(self):
        if self.tmpdata and self.tmpdata.exists():
            self.tmpdata.remove(ignore_errors=True)
        self.tmpdata = self.tmpdir.mkdir('data')
        self.pristine_data.copy(self.tmpdata, mode=True)
        # .git directories can't be added to the index, so the pristine copy
        # stores the repository as '.hg' and it is restored as '.git' here.
        pristine_repo = self.pristine_data.join('scripts/project/.hg')
        tmp_repo = self.tmpdata.join('scripts/project/.git')
        pristine_repo.copy(tmp_repo)
        return self.tmpdata

    def sysexec(self, script):
        print('Executing Script: %s' % script)
        return script.sysexec(cwd=str(script.dirpath()))

    def verify_data(self):
        if not self.tmpdata:
            return False
        for prissy in self.pristine_data.visit():
            assert prissy.ext != '.pyc', \
                'Pristine has Python bytecode indicating execution from pristine directory!'
            rel = prissy.relto(self.pristine_data)
            tmp = self.tmpdata.join(rel)
            if prissy.check(dir=True):
                assert tmp.check(dir=True), 'Data integrity test failed: %s' % rel
            elif prissy.check(file=True):
                assert tmp.check(file=True), 'Data integrity test failed: %s' % rel
                assert prissy.computehash() == tmp.computehash(), 'Hash mismatch: %s' % rel
        for tmp in self.tmpdata.visit():
            if '.git' in tmp.strpath or '__pycache__' in tmp.strpath or tmp.ext == '.pyc':
                continue
            rel = tmp.relto(self.tmpdata)
            prissy = self.pristine_data.join(rel)
            if tmp.check(dir=True):
                assert prissy.check(dir=True), 'Directory created in tmpdir: %s' % rel
            elif tmp.check(file=True):
                assert prissy.check(file=True), 'File created in tmpdir: %s' % rel
        return True

    def copy_installed(self):
        if not self.installed_packages:
            self.installed_packages = installed_packages()
        return copy.deepcopy(self.installed_packages)
def test_tmpdir(tmpdir: local.LocalPath) -> None:
    file_name: str = "hello.cpp"
    file_content: str = "#include <iostream>"
    p = tmpdir.join(file_name)
    p.write(file_content)
    assert tmpdir.join(file_name).isfile()
    assert p.read() == file_content
def test_client_private_key_path(
    patched_contract,
    monkeypatch: MonkeyPatch,
    sender_privkey: str,
    tmpdir: LocalPath,
    web3: Web3,
    channel_manager_address: str,
):
    def check_permission_safety_patched(path: str):
        return True

    monkeypatch.setattr(
        microraiden.utils.private_key,
        'check_permission_safety',
        check_permission_safety_patched,
    )

    privkey_file = tmpdir.join('private_key.txt')
    privkey_file.write(sender_privkey)

    with pytest.raises(AssertionError):
        Client(
            private_key='0xthis_is_not_a_private_key',
            channel_manager_address=channel_manager_address,
            web3=web3,
        )
    with pytest.raises(AssertionError):
        Client(
            private_key='0xcorrect_length_but_still_not_a_private_key_12345678901234567',
            channel_manager_address=channel_manager_address,
            web3=web3,
        )
    with pytest.raises(AssertionError):
        Client(
            private_key='/nonexisting/path',
            channel_manager_address=channel_manager_address,
            web3=web3,
        )

    Client(
        private_key=sender_privkey,
        channel_manager_address=channel_manager_address,
        web3=web3,
    )
    Client(
        private_key=sender_privkey[2:],
        channel_manager_address=channel_manager_address,
        web3=web3,
    )
    Client(
        private_key=str(tmpdir.join('private_key.txt')),
        channel_manager_address=channel_manager_address,
        web3=web3,
    )
async def test_send_file_path(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(Path(file_.realpath()))
    assert (await response.get_data(raw=True)) == file_.read_binary()
async def test_send_file_max_age(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(str(file_.realpath()))
    assert response.cache_control.max_age == app.send_file_max_age_default.total_seconds()
def fake_test_dataset_path(
    tmpdir: LocalPath,
    fake_test_dataset: pd.DataFrame,
) -> LocalPath:
    path = tmpdir.join("fake_test_data.csv")
    fake_test_dataset.to_csv(path, index=False)
    return path
def test_lockmanager_modifiedtime_updated(tmpdir: LocalPath):
    lockman = LockManager()
    try:
        lockman.lock(str(tmpdir))
    except OSError:
        raise
    else:
        lockfile: LocalPath = tmpdir.join(lockman.filename)
        assert exists(str(lockfile))
        # Don't bother specifying the exact type returned.
        # Since `LocalPath.mtime()` wraps `os.stat_result.st_mtime`,
        # the exact return type may be OS-dependent.
        pre_time = lockfile.mtime()

    sleep(1)

    try:
        lockman.lock(str(tmpdir))
    except OSError:
        raise
    else:
        post_time = lockfile.mtime()
        assert pre_time < post_time
async def test_send_file_as_attachment(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(Path(file_.realpath()), as_attachment=True)
    assert response.headers["content-disposition"] == "attachment; filename=send.img"
def make_hash_entries(
    tmpdir: LocalPath,
    entries: int,
    irrelevant_entries: int,
    nonexistant_entries: int,
    hashfilepath: Optional[LocalPath],
) -> LocalPath:
    """(Also makes cache files in the same directory.)"""
    hashfilepath = (
        tmpdir.join(syphon.core.check.DEFAULT_FILE)
        if hashfilepath is None
        else hashfilepath
    )
    hash_content_list = []

    cache_file: LocalPath
    for cache_file in make_cache(hashfilepath.dirpath(), n=entries):
        hash_content_list.append(str(syphon.hash.HashEntry(cache_file)))

    random_file: LocalPath
    for random_file in make_random_file(hashfilepath.dirpath(), irrelevant_entries):
        hash_content_list.append(str(syphon.hash.HashEntry(random_file)))

    nonexistant_file: LocalPath
    for nonexistant_file in make_random_file(hashfilepath.dirpath(), nonexistant_entries):
        hash_content_list.append(str(syphon.hash.HashEntry(nonexistant_file)))
        nonexistant_file.remove()

    # Randomize the order of the generated hash file entries.
    if len(hash_content_list) > 0:
        hashfilepath.write("\n".join(randomize(*hash_content_list)))

    return hashfilepath
def fake_dataset_path(tmpdir: LocalPath, seed: int = 100, size: int = 10_000) -> LocalPath:
    df = make_dataset(seed=seed, size=size)
    fake = tmpdir.join("sample_fake.csv")
    df.to_csv(fake, index=False)
    return fake
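# make_dataset is defined elsewhere in this suite; a minimal sketch of a
# compatible generator (hypothetical column names, assuming a numpy/pandas
# synthetic-data helper) could look like:
import numpy as np
import pandas as pd


def make_dataset(seed: int, size: int) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    return pd.DataFrame({
        "feature_a": rng.normal(size=size),           # numerical feature
        "feature_b": rng.integers(0, 10, size=size),  # low-cardinality feature
        "target": rng.integers(0, 2, size=size),      # binary label
    })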
def test_cpp_generator_puts_autogenerated_header(tmpdir: local.LocalPath, cpp_includes: str) -> None:
    _add_content(tmpdir, cpp_includes)
    CppGenerator(_get_file_path(tmpdir)).generate()
    verified: bool = False
    with open(tmpdir.join(_get_generated_file_name()), "r") as f:
        assert f.read() == '#include <iostream>\n\n#include "AUTOGENERATED.h"\n\n\nusing namespace std;\n\n'
        verified = True
    assert verified
async def test_send_file_mimetype(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.bob')
    file_.write('something')
    async with app.app_context():
        response = await send_file(Path(file_.realpath()), mimetype="application/bob")
    assert (await response.get_data(raw=True)) == file_.read_binary()
    assert response.headers["Content-Type"] == "application/bob"
async def test_send_file_last_modified_override(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    last_modified = datetime(2015, 10, 10, tzinfo=timezone.utc)
    async with app.app_context():
        response = await send_file(str(file_.realpath()), last_modified=last_modified)
    assert response.last_modified == last_modified
def fake_dataset_path(tmpdir: LocalPath, seed: int = 100, size: int = 10_000) -> LocalPath:
    df = make_dataset(seed=seed, size=size)
    df.drop('target', axis=1, inplace=True, errors='ignore')
    print(df.columns)
    fake = tmpdir.join("sample_fake.csv")
    df.to_csv(fake, index=False)
    return fake
async def test_file_wrapper(tmpdir: LocalPath) -> None:
    file_ = tmpdir.join("file_wrapper")
    file_.write("abcdef")
    wrapper = FileBody(Path(file_.realpath()), buffer_size=3)
    results = []
    async with wrapper as response:
        async for data in response:
            results.append(data)
    assert results == [b"abc", b"def"]
async def test_send_file_last_modified(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(str(file_.realpath()))
    mtime = datetime.fromtimestamp(file_.mtime(), tz=timezone.utc)
    mtime = mtime.replace(microsecond=0)
    assert response.last_modified == mtime
def create_subdirectories(path, amount, depth):
    path = str(path)
    for x in range(amount):
        p = LocalPath(path).join(str(x))
        p.mkdir()
        _f = p.join("testreport.xml")
        _f.write("")
        if depth != 0:
            depth -= 1
            create_subdirectories(p, 1, depth)
def test_serialize_model(tmpdir: LocalPath):
    expected_output = tmpdir.join("models.pkl")
    n_estimators = 10
    model = RandomForestClassifier(n_estimators=n_estimators)
    real_output = serialize_model(model, expected_output)
    assert real_output == expected_output
    assert os.path.exists(real_output)
    with open(real_output, "rb") as f:
        model = pickle.load(f)
    assert isinstance(model, RandomForestClassifier)
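# The assertions above pin down serialize_model's contract: it pickles the
# estimator to the requested path and returns that path. A minimal sketch
# consistent with the test (an assumption, not necessarily the project's
# actual implementation):
import pickle


def serialize_model(model, output_path):
    with open(output_path, "wb") as f:
        pickle.dump(model, f)
    return output_path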
def simple_df(tmpdir: LocalPath) -> LocalPath:
    test_df = dedent(
        """
        h1,h2,h3,h4
        1,2,3,4
        5,6,7,8
        9,10,11,12
        """
    )
    sample = tmpdir.join("sample.csv")
    sample.write(test_df)
    return sample
def make_cache(
    directory: LocalPath, n: int = 1, name_formatter: Optional[str] = None
) -> Iterator[LocalPath]:
    if name_formatter is None:
        name_formatter = "cache%d.csv"
    for i in range(n):
        cache: LocalPath = directory.join(name_formatter % i)
        cache.write(rand_string())
        yield cache
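# rand_string is an external helper not shown here; a plausible stand-in
# (hypothetical implementation) that fills each cache file with random
# lowercase text:
import random
import string


def rand_string(length: int = 32) -> str:
    return "".join(random.choices(string.ascii_lowercase, k=length))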
def test_serialize_model(tmpdir: LocalPath, config_test):
    for clf in config_test.model_types:
        clf = create_object_by_type(clf)
        expected_output = tmpdir.join("model.pkl")
        model = clf()
        real_output = serialize_model(model, expected_output)
        assert real_output == expected_output
        assert os.path.exists(real_output)
        with open(real_output, "rb") as f:
            model = pickle.load(f)
        assert isinstance(model, clf)
def create_subdirectories(path, amount, depth):
    path = str(path)
    for x in range(amount):
        p = LocalPath(path).join(str(depth))
        p.mkdir()
        for ext in [".a", ".b", ".c", ".d", ""]:
            _f = p.join("testfile%s" % ext)
            _f.write("")
        if depth != 0:
            depth -= 1
            create_subdirectories(p, 1, depth)
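# A hypothetical smoke test (names here are illustrative, not from the
# source) showing how the helper above might be driven from a pytest
# tmpdir fixture:
def test_create_subdirectories_smoke(tmpdir):
    create_subdirectories(tmpdir, amount=2, depth=3)
    # The first level should now contain directories named after the depth
    # counter, each holding the empty testfile.* payloads.
    assert any(p.check(dir=True) for p in tmpdir.listdir())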
def test_lockmanager_lock_returns_lockfile(tmpdir: LocalPath):
    lockman = LockManager()
    try:
        actual_file: str = lockman.lock(str(tmpdir))
    except OSError:
        raise
    else:
        expected_file: LocalPath = tmpdir.join(lockman.filename)
        assert str(expected_file) == actual_file
def simple_feature_map_transformers(tmpdir: LocalPath) -> LocalPath:
    feature_to_transformer = dedent(
        """
        NoneTransformer:
          - h1
          - h2
          - h3
          - h4
        """
    )
    feat2trmers_test = tmpdir.join("feature_test.yaml")
    feat2trmers_test.write(feature_to_transformer)
    return feat2trmers_test
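# For illustration, consuming the fixture with PyYAML (not part of the
# original snippet) yields a transformer-name -> column-list mapping:
import yaml


def load_feature_map(path) -> dict:
    with open(str(path)) as f:
        return yaml.safe_load(f)

# load_feature_map(...) on the file above returns
# {'NoneTransformer': ['h1', 'h2', 'h3', 'h4']}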
def train_df(tmpdir: LocalPath) -> pd.DataFrame:
    sample = dedent(
        """
        h1,h2,h3,h4
        1,2,3,4
        5,6,7,8
        9,10,11,12
        13,14,15,16
        """
    )
    csv = tmpdir.join("sample.csv")
    csv.write(sample)
    return pd.read_csv(csv)
def install_coverage(venv='venv'):
    venv = Path(venv)
    if not venv.exists():
        run('virtualenv', str(venv))
    run(str(venv.join('bin/python')), '-m', 'pip.__main__', 'install', '-r', str(COVERAGE_REQS))
# NOTE WELL: No side-effects are allowed in __init__ files. This means you!
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import os
from re import compile as Regex
from re import MULTILINE

from pip._internal.wheel import Wheel
from py._path.local import LocalPath as Path

TOP = Path(__file__) / '../../..'
COVERAGE_REQS = TOP.join('requirements.d/coverage.txt')


def requirements(reqs, path='requirements.txt'):
    """Write a requirements.txt file to the current working directory."""
    Path(path).write(reqs)


def run(*cmd, **env):
    if env:
        from os import environ
        tmp = env
        env = environ.copy()
        env.update(tmp)
    else:
        env = None
    from .capture_subprocess import capture_subprocess