Example #1
def test_train_full_pipeline(
    tmpdir: LocalPath,
    dataset_path: str,
    categorical_features: List[str],
    numerical_features: List[str],
    target_col: str,
):
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_report_path = tmpdir.join()  # note: join() without arguments resolves to tmpdir itself
    params = TrainingPipelineParams(
        input_data_path=dataset_path,
        output_model_path=expected_output_model_path,
        metric_path=expected_metric_path,
        report_path=expected_report_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=42),
        feature_params=FeatureParams(
            numerical_features=numerical_features,
            categorical_features=categorical_features,
            target_col=target_col,
        ),
        train_params=TrainingParams(model_type="KNeighborsClassifier"),
    )
    real_model_path, metrics = train_pipeline(params)
    assert metrics["auc"] > 0.5
Example #2
def test_train_e2e(tmpdir: LocalPath, fake_dataset: str,
                   categorical_features: List[str],
                   numerical_features: List[str], target_col: str,
                   features_to_drop: List[str], config_test):
    categorical_features = list(
        set(categorical_features) - set(numerical_features))
    features_to_drop = list(set(features_to_drop) - set(numerical_features))
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_transformer_path = tmpdir.join("transformer.pkl")
    for model_type in config_test.model_types:
        params = TrainingPipelineParams(
            input_data_path=fake_dataset,
            input_data_url="",
            output_model_path=expected_output_model_path,
            metric_path=expected_metric_path,
            transformer_path=expected_transformer_path,
            splitting_params=SplittingParams(
                val_size=config_test.splitting_val_size,
                random_state=config_test.splitting_random_state,
            ),
            feature_params=FeatureParams(
                numerical_features=numerical_features,
                categorical_features=categorical_features,
                target_col=target_col,
                features_to_drop=features_to_drop,
            ),
            train_params=TrainingParams(model_type=model_type),
        )
        real_model_path, metrics = train_pipeline(params)
        assert metrics["accuracy"] >= config_test.min_accuracy
        assert os.path.exists(real_model_path)
        assert os.path.exists(params.metric_path)
Example #3
def params(tmpdir: LocalPath, numerical_features_yes: List[str],
           target_col: str):

    expected_train_data_path = tmpdir.join("train.csv")
    expected_model_path = tmpdir.join("models.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_transformer_path = tmpdir.join("transformer.pkl")
    expected_source_data_path = tmpdir.join("source.csv")
    expected_result_data_path = tmpdir.join("result.csv")

    params = Params(
        report_path="",
        train_data_path=expected_train_data_path,
        model_path=expected_model_path,
        features_transformer_path=expected_transformer_path,
        metric_path=expected_metric_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=239),
        train_params=TrainingParams(model_type="RandomForestClassifier"),
        feature_params=FeatureParams(categorical_features=None,
                                     numerical_features=numerical_features_yes,
                                     features_to_drop=None,
                                     target_col=target_col),
        inference_params=InferenceParams(
            source_data_path=expected_source_data_path,
            result_data_path=expected_result_data_path))
    return params
Example #4
def test_train_e2e(
    tmpdir: LocalPath,
    dataset_path: str,
    categorical_features: List[str],
    numerical_features: List[str],
    target_col: str,
    features_to_drop: List[str],
):
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_pretrained_model_path = expected_output_model_path
    expected_predictions_path = tmpdir.join("data/predicted/predictions.csv")
    params = TrainingPipelineParams(
        input_data_path=dataset_path,
        output_model_path=expected_output_model_path,
        metric_path=expected_metric_path,
        pretrained_model_path=expected_pretrained_model_path,
        predictions_path=expected_predictions_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=1234),
        feature_params=FeatureParams(
            numerical_features=numerical_features,
            categorical_features=categorical_features,
            target_col=target_col,
            features_to_drop=features_to_drop,
            use_log_trick=False,
        ),
        train_params=TrainingParams(model_type="LogisticRegression"),
    )
    real_model_path, metrics = train_pipeline(params, LogisticRegression())
    assert metrics["roc_auc"] > 0
    assert os.path.exists(real_model_path)
    assert os.path.exists(params.metric_path)
Example #5
def params(
        tmpdir: LocalPath,
        dataset_path: str,
        categorical_features: List[str],
        numerical_features: List[str],
        target_col: str
):
    expected_output_model_path = tmpdir.join("model.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    params = PipelineParams(
        train_data_path=dataset_path,
        data_for_pred_path=tmpdir.join('fake_data.csv'),
        predictions_path=tmpdir.join('predictions.csv'),
        transformer_path=tmpdir.join('transformer.pkl'),
        model_path=expected_output_model_path,
        metric_path=expected_metric_path,
        split_params=SplitParams(),
        features_params=FeatureParams(
            numerical=numerical_features,
            categorical=categorical_features,
            target=target_col,
        ),
        train_params=TrainParams(model_type="LogisticRegression", C=1, n_jobs=-1, penalty='l2'),
    )
    return params
Example #6
def params(
    dataset_path: str,
    tmpdir: LocalPath,
    categorical_features_no: Optional[str],
    numerical_features_yes: List[str],
    target_col: str,
    features_to_drop_no: Optional[str],
):

    expected_output_model_path = tmpdir.join("models.pkl")
    expected_metric_path = tmpdir.join("metrics.json")
    expected_features_transformer_path = tmpdir.join(
        "features_transformer.pkl")

    params = Params(
        report_path="",
        train_data_path=dataset_path,
        model_path=expected_output_model_path,
        features_transformer_path=expected_features_transformer_path,
        metric_path=expected_metric_path,
        splitting_params=SplittingParams(val_size=0.2, random_state=239),
        train_params=TrainingParams(model_type="RandomForestClassifier"),
        feature_params=FeatureParams(
            numerical_features=numerical_features_yes,
            categorical_features=categorical_features_no,
            target_col=target_col,
            features_to_drop=features_to_drop_no),
        inference_params=InferenceParams(source_data_path="",
                                         result_data_path=""))

    return params
Example #7
class ScriptData(object):
    def __init__(self, tmpdir):
        self.tmpdir = tmpdir
        self.tmpdata = None
        self.pristine_data = LocalPath(os.path.dirname(__file__)).join('data')
        self.installed_packages = None

    def copy_data(self):
        if self.tmpdata and self.tmpdata.exists():
            self.tmpdata.remove(ignore_errors=True)
        self.tmpdata = self.tmpdir.mkdir('data')
        self.pristine_data.copy(self.tmpdata, mode=True)

        # A .git directory can't be committed to the index, so the pristine
        # tree stores it as .hg; restore it as .git in the temporary copy.
        pristine_hg = self.pristine_data.join('scripts/project/.hg')
        tmp_git = self.tmpdata.join('scripts/project/.git')
        pristine_hg.copy(tmp_git)

        return self.tmpdata

    def sysexec(self, script):
        print('Executing Script: %s' % script)
        return script.sysexec(cwd=str(script.dirpath()))

    def verify_data(self):
        if not self.tmpdata:
            return False

        for prissy in self.pristine_data.visit():
            assert prissy.ext != '.pyc', \
                'Pristine has Python bytecode indicating execution from pristine directory!'

            rel = prissy.relto(self.pristine_data)
            tmp = self.tmpdata.join(rel)

            if prissy.check(dir=True):
                assert tmp.check(dir=True), 'Data integrity test failed: %s' % rel
            elif prissy.check(file=True):
                assert tmp.check(file=True), 'Data integrity test failed: %s' % rel
                assert prissy.computehash() == tmp.computehash(), 'Hash mismatch: %s' % rel

        for tmp in self.tmpdata.visit():
            if '.git' in tmp.strpath or '__pycache__' in tmp.strpath or tmp.ext == '.pyc':
                continue

            rel = tmp.relto(self.tmpdata)
            prissy = self.pristine_data.join(rel)

            if tmp.check(dir=True):
                assert prissy.check(dir=True), 'Directory created in tmpdir: %s' % rel
            elif tmp.check(file=True):
                assert prissy.check(file=True), 'File created in tmpdir: %s' % rel

        return True

    def copy_installed(self):
        if not self.installed_packages:
            self.installed_packages = installed_packages()
        return copy.deepcopy(self.installed_packages)
Example #8
def test_tmpdir(tmpdir: local.LocalPath) -> None:
    file_name: str = "hello.cpp"
    file_content: str = "#include <iostream>"
    p = tmpdir.join(file_name)
    p.write(file_content)

    assert tmpdir.join(file_name).isfile()
    assert p.read() == file_content
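Note: on recent pytest versions the same temporary directory is also exposed as a pathlib.Path via the tmp_path fixture. A minimal equivalent of the test above, assuming nothing beyond that standard fixture:

import pathlib


def test_tmp_path(tmp_path: pathlib.Path) -> None:
    file_name: str = "hello.cpp"
    file_content: str = "#include <iostream>"
    p = tmp_path / file_name
    p.write_text(file_content)  # pathlib spelling of LocalPath.write()

    assert (tmp_path / file_name).is_file()
    assert p.read_text() == file_content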
Example #9
def test_client_private_key_path(
        patched_contract,
        monkeypatch: MonkeyPatch,
        sender_privkey: str,
        tmpdir: LocalPath,
        web3: Web3,
        channel_manager_address: str
):
    def check_permission_safety_patched(path: str):
        return True

    monkeypatch.setattr(
        microraiden.utils.private_key,
        'check_permission_safety',
        check_permission_safety_patched
    )

    privkey_file = tmpdir.join('private_key.txt')
    privkey_file.write(sender_privkey)

    with pytest.raises(AssertionError):
        Client(
            private_key='0xthis_is_not_a_private_key',
            channel_manager_address=channel_manager_address,
            web3=web3
        )

    with pytest.raises(AssertionError):
        Client(
            private_key='0xcorrect_length_but_still_not_a_private_key_12345678901234567',
            channel_manager_address=channel_manager_address,
            web3=web3
        )

    with pytest.raises(AssertionError):
        Client(
            private_key='/nonexisting/path',
            channel_manager_address=channel_manager_address,
            web3=web3
        )

    Client(
        private_key=sender_privkey,
        channel_manager_address=channel_manager_address,
        web3=web3
    )

    Client(
        private_key=sender_privkey[2:],
        channel_manager_address=channel_manager_address,
        web3=web3
    )

    Client(
        private_key=str(tmpdir.join('private_key.txt')),
        channel_manager_address=channel_manager_address,
        web3=web3
    )
Example #10
async def test_send_file_path(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(Path(file_.realpath()))
    assert (await response.get_data(raw=True)) == file_.read_binary()
Example #11
async def test_send_file_max_age(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(str(file_.realpath()))
    assert response.cache_control.max_age == app.send_file_max_age_default.total_seconds()
Example #12
def fake_test_dataset_path(
    tmpdir: LocalPath,
    fake_test_dataset: pd.DataFrame,
) -> LocalPath:
    path = tmpdir.join("fake_test_data.csv")
    fake_test_dataset.to_csv(path, index=False)
    return path
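A usage sketch for the fixture above (hypothetical test, assuming fake_test_dataset_path and fake_test_dataset are both registered with @pytest.fixture):

import pandas as pd


def test_fake_test_dataset_roundtrip(fake_test_dataset_path, fake_test_dataset: pd.DataFrame):
    # Reading the CSV back should reproduce the shape of the original frame.
    df = pd.read_csv(fake_test_dataset_path)
    assert df.shape == fake_test_dataset.shape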
Example #13
def test_lockmanager_modifiedtime_updated(tmpdir: LocalPath):
    lockman = LockManager()

    try:
        lockman.lock(str(tmpdir))
    except OSError:
        raise
    else:
        lockfile: LocalPath = tmpdir.join(lockman.filename)

    assert exists(str(lockfile))

    # Don't bother specifying the exact type returned.
    # Since `LocalPath.mtime()` wraps `os.stat_result.st_mtime`,
    # the exact return type may be OS-dependent.
    pre_time = lockfile.mtime()
    sleep(1)
    try:
        lockman.lock(str(tmpdir))
    except OSError:
        raise
    else:
        post_time = lockfile.mtime()

    assert pre_time < post_time
Example #14
async def test_send_file_as_attachment(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(Path(file_.realpath()), as_attachment=True)
    assert response.headers["content-disposition"] == "attachment; filename=send.img"
Example #15
def make_hash_entries(
    tmpdir: LocalPath,
    entries: int,
    irrelevant_entries: int,
    nonexistant_entries: int,
    hashfilepath: Optional[LocalPath],
) -> LocalPath:
    """(Also makes cache files in the same directory.)"""
    hashfilepath = (tmpdir.join(syphon.core.check.DEFAULT_FILE)
                    if hashfilepath is None else hashfilepath)

    hash_content_list = []

    cache_file: LocalPath
    for cache_file in make_cache(hashfilepath.dirpath(), n=entries):
        hash_content_list.append(str(syphon.hash.HashEntry(cache_file)))

    random_file: LocalPath
    for random_file in make_random_file(hashfilepath.dirpath(),
                                        irrelevant_entries):
        hash_content_list.append(str(syphon.hash.HashEntry(random_file)))

    nonexistant_file: LocalPath
    for nonexistant_file in make_random_file(hashfilepath.dirpath(),
                                             nonexistant_entries):
        hash_content_list.append(str(syphon.hash.HashEntry(nonexistant_file)))
        nonexistant_file.remove()

    # Randomize the order of the generated hash file entries.
    if len(hash_content_list) > 0:
        hashfilepath.write("\n".join(randomize(*hash_content_list)))

    return hashfilepath
Example #16
def fake_dataset_path(tmpdir: LocalPath,
                      seed: int = 100,
                      size: int = 10_000) -> LocalPath:
    df = make_dataset(seed=seed, size=size)
    fake = tmpdir.join("sample_fake.csv")
    df.to_csv(fake, index=False)
    return fake
Example #17
def test_cpp_generator_puts_autogenerated_header(tmpdir: local.LocalPath, cpp_includes: str) -> None:
    _add_content(tmpdir, cpp_includes)
    CppGenerator(_get_file_path(tmpdir)).generate()
    verified: bool = False
    with open(tmpdir.join(_get_generated_file_name()), "r") as f:
        assert f.read() == '#include <iostream>\n\n#include "AUTOGENERATED.h"\n\n\nusing namespace std;\n\n'
        verified = True
    assert verified
Example #18
async def test_send_file_mimetype(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.bob')
    file_.write('something')
    async with app.app_context():
        response = await send_file(Path(file_.realpath()), mimetype="application/bob")
    assert (await response.get_data(raw=True)) == file_.read_binary()
    assert response.headers["Content-Type"] == "application/bob"
Example #19
async def test_send_file_last_modified_override(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    last_modified = datetime(2015, 10, 10, tzinfo=timezone.utc)
    async with app.app_context():
        response = await send_file(str(file_.realpath()), last_modified=last_modified)
    assert response.last_modified == last_modified
Example #20
def fake_dataset_path(tmpdir: LocalPath,
                      seed: int = 100,
                      size: int = 10_000) -> LocalPath:
    df = make_dataset(seed=seed, size=size)
    df.drop('target', axis=1, inplace=True, errors='ignore')
    print(df.columns)
    fake = tmpdir.join("sample_fake.csv")
    df.to_csv(fake, index=False)
    return fake
Example #21
async def test_file_wrapper(tmpdir: LocalPath) -> None:
    file_ = tmpdir.join("file_wrapper")
    file_.write("abcdef")
    wrapper = FileBody(Path(file_.realpath()), buffer_size=3)
    results = []
    async with wrapper as response:
        async for data in response:
            results.append(data)
    assert results == [b"abc", b"def"]
Example #22
async def test_send_file_last_modified(tmpdir: LocalPath) -> None:
    app = Quart(__name__)
    file_ = tmpdir.join('send.img')
    file_.write('something')
    async with app.app_context():
        response = await send_file(str(file_.realpath()))
    mtime = datetime.fromtimestamp(file_.mtime(), tz=timezone.utc)
    mtime = mtime.replace(microsecond=0)
    assert response.last_modified == mtime
Example #23
def create_subdirectories(path, amount, depth):
    path = str(path)
    for x in range(amount):
        p = LocalPath(path).join(str(x))
        p.mkdir()
        _f = p.join("testreport.xml")
        _f.write("")
        if not depth == 0:
            depth -= 1
            create_subdirectories(p, 1, depth)
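A usage sketch for the helper above: each call creates `amount` numbered directories, writes an empty testreport.xml into each, and recurses one level per remaining unit of depth (the test below is hypothetical):

def test_create_subdirectories(tmpdir: LocalPath):
    create_subdirectories(tmpdir, amount=1, depth=2)
    # One chain of nested "0" directories, each holding a report file.
    assert tmpdir.join("0", "testreport.xml").check(file=True)
    assert tmpdir.join("0", "0", "testreport.xml").check(file=True)
    assert tmpdir.join("0", "0", "0", "testreport.xml").check(file=True)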
Example #24
def test_serialize_model(tmpdir: LocalPath):
    expected_output = tmpdir.join("models.pkl")
    n_estimators = 10
    model = RandomForestClassifier(n_estimators=n_estimators)
    real_output = serialize_model(model, expected_output)
    assert real_output == expected_output
    assert os.path.exists(real_output)
    with open(real_output, "rb") as f:
        model = pickle.load(f)
    assert isinstance(model, RandomForestClassifier)
Example #25
def simple_df(tmpdir: LocalPath) -> LocalPath:
    test_df = dedent("""
        h1,h2,h3,h4
        1,2,3,4
        5,6,7,8
        9,10,11,12
    """)
    sample = tmpdir.join("sample.csv")
    sample.write(test_df)
    return sample
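A usage sketch (hypothetical test, assuming simple_df is registered with @pytest.fixture):

import pandas as pd


def test_simple_df(simple_df):
    # read_csv skips the blank leading line left by the dedent literal.
    df = pd.read_csv(simple_df)
    assert list(df.columns) == ["h1", "h2", "h3", "h4"]
    assert len(df) == 3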
Example #26
def make_cache(directory: LocalPath,
               n: int = 1,
               name_formatter: Optional[str] = None) -> Iterator[LocalPath]:
    if name_formatter is None:
        name_formatter = "cache%d.csv"

    for i in range(n):
        cache: LocalPath = directory.join(name_formatter % i)
        cache.write(rand_string())
        yield cache
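make_cache is a generator, so the cache files are only written as it is consumed. A usage sketch under that assumption (rand_string is the helper the example already relies on):

def test_make_cache(tmpdir: LocalPath):
    caches = list(make_cache(tmpdir, n=3))
    assert [c.basename for c in caches] == ["cache0.csv", "cache1.csv", "cache2.csv"]
    assert all(c.check(file=True) for c in caches)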
Example #27
def test_serialize_model(tmpdir: LocalPath, config_test):
    for clf in config_test.model_types:
        clf = create_object_by_type(clf)
        expected_output = tmpdir.join("model.pkl")
        model = clf()
        real_output = serialize_model(model, expected_output)
        assert real_output == expected_output
        assert os.path.exists(real_output)
        with open(real_output, "rb") as f:
            model = pickle.load(f)
        assert isinstance(model, clf)
Example #28
def create_subdirectories(path, amount, depth):
    path = str(path)
    for x in range(amount):
        p = LocalPath(path).join(str(depth))
        p.mkdir()
        for ext in [".a", ".b", ".c", ".d", ""]:
            _f = p.join("testfile%s" % ext)
            _f.write("")
        if not depth == 0:
            depth -= 1
            create_subdirectories(p, 1, depth)
Example #29
def test_lockmanager_lock_returns_lockfile(tmpdir: LocalPath):
    lockman = LockManager()

    try:
        actual_file: str = lockman.lock(str(tmpdir))
    except OSError:
        raise
    else:
        expected_file: LocalPath = tmpdir.join(lockman.filename)

    assert str(expected_file) == actual_file
Example #30
def simple_feature_map_transformers(tmpdir: LocalPath) -> LocalPath:
    feature_to_transformer = dedent("""
    NoneTransformer:
      - h1
      - h2
      - h3
      - h4
    """)
    feat2trmers_test = tmpdir.join("feature_test.yaml")
    feat2trmers_test.write(feature_to_transformer)
    return feat2trmers_test
Example #31
def train_df(tmpdir: LocalPath) -> pd.DataFrame:
    sample = dedent("""
        h1,h2,h3,h4
        1,2,3,4
        5,6,7,8
        9,10,11,12
        13,14,15,16
    """)
    csv = tmpdir.join("sample.csv")
    csv.write(sample)
    return pd.read_csv(csv)
Example #32
def install_coverage(venv='venv'):
    venv = Path(venv)
    if not venv.exists():
        run('virtualenv', str(venv))
    run(str(venv.join('bin/python')), '-m', 'pip.__main__', 'install', '-r', str(COVERAGE_REQS))
Example #33
# NOTE WELL: No side-effects are allowed in __init__ files. This means you!
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import os
from re import compile as Regex
from re import MULTILINE

from pip._internal.wheel import Wheel
from py._path.local import LocalPath as Path

TOP = Path(__file__) / '../../..'
COVERAGE_REQS = TOP.join('requirements.d/coverage.txt')


def requirements(reqs, path='requirements.txt'):
    """Write a requirements.txt file to the current working directory."""
    Path(path).write(reqs)


def run(*cmd, **env):
    if env:
        from os import environ
        tmp = env
        env = environ.copy()
        env.update(tmp)
    else:
        env = None

    from .capture_subprocess import capture_subprocess