Example #1
    def validate(self):
        """Collect and validate all new features"""

        changes = self.change_collector.collect_changes()

        features = []
        imported_okay = True
        for importer, modname, modpath in changes.new_feature_info:
            try:
                mod = importer()
                features.extend(_collect_contrib_features(mod))
            except (ImportError, SyntaxError):
                logger.info(f'Failed to import module at {modpath}')
                logger.exception('Exception details: ')
                imported_okay = False

        if not imported_okay:
            return False

        # if no features were added at all, reject
        if not features:
            logger.info('Failed to collect any new features.')
            return False

        return all(
            validate_feature_api(feature, self.X_df, self.y, False)
            for feature in features
        )
Example #2
    def submit(self):
        user, feature = self.feature_queue.pop(0)
        logger.info(
            "Submitting: User {user:02d}, Feature {feature:02d}".format(
                user=user, feature=feature))
        submit_to_github(user, feature, str(self.feature_path), False, None,
                         True, True)
Example #3
def _prune_existing_features(project: Project,
                             force: bool = False) -> List[Feature]:
    """Prune existing features"""
    if not force and not project.on_master:
        raise SkippedValidationTest('Not on master')

    try:
        # if on master but not after merge, then we diff master with itself
        # and collect no features.
        proposed_feature = get_proposed_feature(project)
    except NoFeaturesCollectedError:
        raise SkippedValidationTest('No features collected')

    X_df, y_df = project.api.load_data()
    X_df_val, y_df_val = _load_validation_data(project)

    encoder = project.api.encoder
    y_val = encoder.fit(y_df).transform(y_df_val)

    features = project.api.features
    accepted_features = get_accepted_features(features, proposed_feature)

    pruner_class = _load_validator_class_params(project,
                                                'validation.feature_pruner')
    pruner = pruner_class(X_df, y_df, X_df_val, y_val, accepted_features,
                          proposed_feature)
    redundant_features = pruner.prune()

    # "propose removal"
    for feature in redundant_features:
        logger.info(PRUNER_MESSAGE + feature.source)

    return redundant_features
Example #4
def _log_recommended_reinstall():
    logger.info(
        'After a successful project template update, try re-installing the\n'
        'project in case the project template requires any different \n'
        'dependencies than what you have installed:\n'
        '\n'
        '    $ invoke install')
Example #5
    def judge(self):
        logger.info(f'Judging feature using {self}')
        outcomes = {
            accepter.__class__.__name__: accepter.judge()
            for accepter in self.accepters
        }
        logger.debug(f'Got outcomes {outcomes!r} from underlying accepters')
        return self.agg(outcomes.values())
Example #6
    def judge(self):
        logger.info(f'Judging feature using {self}')
        z = (self.candidate_feature.as_feature_engineering_pipeline().fit(
            self.X_df, y=self.y_df).transform(self.X_df_val))
        var = np.var(z, axis=0)
        delta = var - self.threshold
        outcome = np.all(delta > 0)
        logger.info(
            f'Feature variance is {var} vs. threshold {self.threshold} '
            f'({delta} above threshold)')
        return outcome
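The variance check above reduces to a per-column test on the transformed feature matrix. A toy illustration of the same decision rule, using plain numpy rather than project code:

import numpy as np

# toy transformed feature matrix with two output columns
z = np.array([[0.0, 1.0],
              [0.1, 3.0],
              [0.2, 5.0]])
threshold = 0.05
var = np.var(z, axis=0)                # per-column variance: [0.00667, 2.66667]
outcome = np.all(var - threshold > 0)
print(var, outcome)                    # the first column fails, so outcome is False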
Example #7
def validate_feature_api(feature, X, y, subsample=False):
    logger.debug('Validating feature {feature!r}'.format(feature=feature))
    if subsample:
        X, y = subsample_data_for_validation(X, y)
    valid, failures = check_from_class(FeatureApiCheck, feature, X, y)
    if valid:
        logger.info('Feature is valid')
    else:
        logger.info(
            'Feature is NOT valid; failures were {failures}'
            .format(failures=failures))
    return valid
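A hypothetical usage sketch: the Feature and IdentityTransformer names follow the pattern used in the end-to-end test later on this page, and X_df and y_df are assumed to already be loaded in scope.

feature = Feature(input='A_1', transformer=IdentityTransformer())
if validate_feature_api(feature, X_df, y_df, subsample=True):
    logger.info('Feature passed the API checks')
else:
    logger.info('Feature was rejected')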
Example #8
    def judge(self):
        logger.info(f'Judging feature using {self}')
        z = (self.candidate_feature.as_feature_engineering_pipeline().fit(
            self.X_df, y=self.y_df).transform(self.X_df_val))
        y = self.y_val
        z, y = asarray2d(z), asarray2d(y)
        z, y = self._handle_nans(z, y)
        if z is None and y is None:
            # nans were found and handle_nan_targets == 'fail'
            return False
        mi = estimate_mutual_information(z, y)
        delta = mi - self.threshold
        outcome = delta > 0
        logger.info(f'Mutual information with target I(Z;Y) is {mi} vs. '
                    f'threshold {self.threshold} ({delta} above threshold)')
        return outcome
Example #9
    def fit(self, X, y, tune=True, **fit_kwargs):
        if tune:
            # do some tuning
            if btb is not None and self.tunables is not None:

                scorer = None

                def score(estimator):
                    scores = cross_val_score(estimator,
                                             X,
                                             y,
                                             scoring=scorer,
                                             cv=self.tuning_cv,
                                             fit_params=fit_kwargs)
                    return np.mean(scores)

                logger.info('Tuning model using BTB GP tuner...')
                tuner = btb.tuning.gp.GP(self.tunables)
                estimator = self._get_parent_instance()
                original_score = score(estimator)
                # TODO: this leads to an error because default value of
                # max_depth for RF is `None`
                # params = funcy.project(
                #     estimator.get_params(), [t[0] for t in self.tunables])
                # tuner.add(params, original_score)
                for i in range(self.tuning_iter):
                    params = tuner.propose()
                    estimator.set_params(**params)
                    score_ = score(estimator)
                    logger.debug('Iteration {}, params {}, score {}'.format(
                        i, params, score_))
                    tuner.add(params, score_)

                best_params = tuner._best_hyperparams
                best_score = tuner._best_score
                self.set_params(**best_params)
                logger.info(
                    'Tuning complete. '
                    'Cross val score changed from {:0.3f} to {:0.3f}.'.format(
                        original_score, best_score))
            else:
                logger.warning('Tuning requested, but either btb not '
                               'installed or tunable HyperParameters not '
                               'specified.')

        return super().fit(X, y, **fit_kwargs)
Example #10
def configure_logging(output_dir):
    logger.setLevel(logging.DEBUG)

    handler = logging.FileHandler(output_dir.joinpath("info.log"))
    formatter = logging.Formatter(SIMPLE_LOG_FORMAT)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    handler = logging.FileHandler(output_dir.joinpath("debug.log"))
    handler.setFormatter(formatter)
    handler.setLevel(logging.DEBUG)
    handler.addFilter(LevelFilter(logging.DEBUG))
    logger.addHandler(handler)

    logger.info("***BEGIN NEW SIMULATION SESSION***.")
    logger.debug("***BEGIN NEW SIMULATION SESSION***.")
Example #11
    def judge(self):
        logger.info('Judging Feature using {}'.format(self))
        feature_dfs_by_src = {}
        for feature in [self.candidate_feature] + self.features:
            feature_df = (
                feature.as_feature_engineering_pipeline().fit_transform(
                    self.X_df, self.y))
            feature_dfs_by_src[feature.source] = feature_df

        candidate_source = self.candidate_feature.source
        candidate_df = feature_dfs_by_src[candidate_source]
        n_samples, n_candidate_cols = candidate_df.shape

        lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                           feature_dfs_by_src)

        logger.info('Candidate Feature Shape: {}'.format(candidate_df.shape))
        omit_in_test = [''] + [f.source for f in self.features]
        for omit in omit_in_test:
            logger.debug('Testing with omitted feature: {}'.format(omit
                                                                   or 'None'))
            z = _concat_datasets(feature_dfs_by_src, n_samples,
                                 [candidate_source, omit])
            logger.debug('Calculating CMI of candidate feature:')
            cmi = estimate_conditional_information(candidate_df, self.y, z)
            logger.debug(
                'Conditional Mutual Information Score: {}'.format(cmi))
            cmi_omit = 0
            n_omit_cols = 0
            if omit:
                omit_df = feature_dfs_by_src[omit]
                _, n_omit_cols = omit_df.shape
                logger.debug('Calculating CMI of omitted feature:')
                cmi_omit = estimate_conditional_information(omit_df, self.y, z)
                logger.debug('Omitted CMI Score: {}'.format(cmi_omit))
                logger.debug('Omitted Feature Shape: {}'.format(omit_df.shape))
            statistic = cmi - cmi_omit
            threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols,
                                           n_omit_cols)
            logger.debug('Calculated Threshold: {}'.format(threshold))
            if statistic >= threshold:
                logger.debug('Succeeded while omitting feature: {}'.format(
                    omit or 'None'))
                return True
        return False
Example #12
    def prune(self):
        """Prune using GFSSF

        Uses lines 12-13 of agGFSSF
        """
        if np.isnan(self.y_val).any():
            raise ValueError(
                f'{self.__class__.__name__} does not support missing targets,'
                ' please use a different evaluator.')

        logger.info(f'Pruning features using {self}')

        feature_df_map = self._get_feature_df_map()
        lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                           feature_df_map)

        logger.info(f'Recomputed lambda_1={lmbda_1}, lambda_2={lmbda_2}')

        redundant_features = []
        for candidate_feature in self.features:
            candidate_src = candidate_feature.source
            logger.debug(
                f'Trying to prune feature with source {candidate_src}')
            candidate_df = feature_df_map[candidate_feature]
            _, n_candidate_cols = candidate_df.shape
            z = _concat_datasets(feature_df_map, omit=[candidate_feature])
            logger.debug(CMI_MESSAGE)
            cmi = estimate_conditional_information(candidate_df, self.y_val, z)

            logger.debug(f'Conditional Mutual Information Score: {cmi}')
            statistic = cmi
            threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols)
            logger.debug(f'Calculated Threshold: {threshold}')
            if statistic >= threshold:
                logger.debug(f'Passed, keeping feature {candidate_src}')
            else:
                # ballet.validation.main._prune_existing_features will log
                # this at level INFO
                logger.debug(
                    f'Failed, found redundant feature: {candidate_src}')
                del feature_df_map[candidate_feature]
                redundant_features.append(candidate_feature)
        return redundant_features
Example #13
    def _categorize_file_diffs(self, file_diffs):
        """Partition file changes into admissible and inadmissible changes"""
        # TODO move this into a new validator
        candidate_feature_diffs = []
        valid_init_diffs = []
        inadmissible_files = []

        for diff in file_diffs:
            valid, failures = check_from_class(ProjectStructureCheck, diff,
                                               self.project)
            if valid:
                if pathlib.Path(diff.b_path).parts[-1] != '__init__.py':
                    candidate_feature_diffs.append(diff)
                    logger.debug(
                        'Categorized {file} as CANDIDATE FEATURE MODULE'.
                        format(file=diff.b_path))
                else:
                    valid_init_diffs.append(diff)
                    logger.debug(
                        'Categorized {file} as VALID INIT MODULE'.format(
                            file=diff.b_path))
            else:
                inadmissible_files.append(diff)
                logger.debug('Categorized {file} as INADMISSIBLE; '
                             'failures were {failures}'.format(
                                 file=diff.b_path, failures=failures))

        logger.info('Admitted {} candidate feature{} '
                    'and {} __init__ module{} '
                    'and rejected {} file{}'.format(
                        len(candidate_feature_diffs),
                        make_plural_suffix(candidate_feature_diffs),
                        len(valid_init_diffs),
                        make_plural_suffix(valid_init_diffs),
                        len(inadmissible_files),
                        make_plural_suffix(inadmissible_files)))

        return candidate_feature_diffs, valid_init_diffs, inadmissible_files
Example #14
    def prune(self):
        feature_dfs_by_src = {}
        for accepted_feature in [self.candidate_feature] + self.features:
            accepted_df = accepted_feature.as_feature_engineering_pipeline(
            ).fit_transform(self.X_df, self.y)
            feature_dfs_by_src[accepted_feature.source] = accepted_df

        lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                           feature_dfs_by_src)

        logger.info(
            "Pruning features using GFSSF: lambda_1={l1}, lambda_2={l2}".
            format(l1=lmbda_1, l2=lmbda_2))

        redundant_features = []
        for candidate_feature in self.features:
            candidate_src = candidate_feature.source
            logger.debug("Pruning feature: {}".format(candidate_src))
            candidate_df = feature_dfs_by_src[candidate_src]
            _, n_candidate_cols = candidate_df.shape
            z = _concat_datasets(feature_dfs_by_src, omit=candidate_src)
            logger.debug(CMI_MESSAGE)
            cmi = estimate_conditional_information(candidate_df, self.y, z)

            logger.debug(
                "Conditional Mutual Information Score: {}".format(cmi))
            statistic = cmi
            threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols)
            logger.debug("Calculated Threshold: {}".format(threshold))
            if statistic >= threshold:
                logger.debug(
                    "Passed, keeping feature: {}".format(candidate_src))
            else:
                logger.debug("Failed, found redundant feature: {}".format(
                    candidate_src))
                del feature_dfs_by_src[candidate_src]
                redundant_features.append(candidate_feature)
        return redundant_features
Example #15
def _prune_existing_features(project, force=False):
    """Prune existing features"""
    if not force and not project.on_master_after_merge:
        raise SkippedValidationTest('Not on master')

    try:
        proposed_feature = get_proposed_feature(project)
    except NoFeaturesCollectedError:
        raise SkippedValidationTest('No features collected')

    out = project.build()
    X_df, y, features = out['X_df'], out['y'], out['features']
    accepted_features = get_accepted_features(features, proposed_feature)

    Pruner = load_class(project, 'validation.feature_pruner')
    pruner = Pruner(X_df, y, accepted_features, proposed_feature)
    redundant_features = pruner.prune()

    # "propose removal"
    for feature in redundant_features:
        logger.info(PRUNER_MESSAGE + feature.source)

    return redundant_features
Example #16
    def _categorize_file_diffs(
        self, file_diffs: git.DiffIndex
    ) -> Tuple[List[git.Diff], List[git.Diff], List[git.Diff]]:
        """Partition file changes into admissible and inadmissible changes"""
        # TODO move this into a new validator
        candidate_feature_diffs = []
        valid_init_diffs = []
        inadmissible_files = []

        for diff in file_diffs:
            valid, failures, _ = check_from_class(ProjectStructureCheck, diff,
                                                  self.project)
            if valid:
                if pathlib.Path(diff.b_path).parts[-1] != '__init__.py':
                    candidate_feature_diffs.append(diff)
                    logger.debug(f'Categorized {diff.b_path} as '
                                 'CANDIDATE FEATURE MODULE')
                else:
                    valid_init_diffs.append(diff)
                    logger.debug(
                        f'Categorized {diff.b_path} as VALID INIT MODULE')
            else:
                inadmissible_files.append(diff)
                logger.debug(f'Categorized {diff.b_path} as INADMISSIBLE; '
                             f'failures were {failures}')

        logger.info('Admitted {n1} candidate feature{s1} '
                    'and {n2} __init__ module{s2} '
                    'and rejected {n3} file{s3}'.format(
                        n1=len(candidate_feature_diffs),
                        s1=make_plural_suffix(candidate_feature_diffs),
                        n2=len(valid_init_diffs),
                        s2=make_plural_suffix(valid_init_diffs),
                        n3=len(inadmissible_files),
                        s3=make_plural_suffix(inadmissible_files)))

        return candidate_feature_diffs, valid_init_diffs, inadmissible_files
Example #17
def test_validation_end_to_end(quickstart):
    project = quickstart.project
    modname = 'foo'
    base = project.path
    repo = project.repo

    def _import(modname):
        relpath = modname_to_relpath(modname,
                                     project_root=base,
                                     add_init=False)
        abspath = base.joinpath(relpath)
        return import_module_at_path(modname, abspath)

    foo = _import('foo')
    assert isinstance(foo, ModuleType)

    foo_features = _import('foo.features')
    assert isinstance(foo_features, ModuleType)

    collect_contrib_features = foo_features.collect_contrib_features
    features = collect_contrib_features()
    assert len(features) == 0

    # first, patch in a mock feature and call build
    with patch.object(
        foo_features, 'collect_contrib_features',
        return_value=[Feature(input='A_1', transformer=IdentityTransformer())]
    ):
        X_df = pd.util.testing.makeCustomDataframe(5, 2)
        X_df.columns = ['A_0', 'A_1']
        out = foo_features.build(X_df=X_df, y_df=[])
        assert np.shape(out.X) == (5, 1)
        assert isinstance(out.mapper_X, FeatureEngineeringPipeline)

    # write a new version of foo.load_data.load_data
    new_load_data_str = get_source(load_regression_data)

    p = base.joinpath(modname, 'load_data.py')
    with p.open('w') as f:
        f.write(new_load_data_str)

    # commit changes
    repo.index.add([str(p)])
    repo.index.commit('Load mock regression dataset')

    # call different validation routines
    def call_validate_all(pr=None):
        envvars = {
            'TRAVIS_BUILD_DIR': repo.working_tree_dir,
        }
        if pr is None:
            envvars['TRAVIS_PULL_REQUEST'] = 'false'
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.commit('HEAD@{-1}').hexsha, repo.commit('HEAD').hexsha)
            envvars['TRAVIS_PULL_REQUEST_BRANCH'] = ''
            envvars['TRAVIS_BRANCH'] = repo.heads.master.name
        else:
            envvars['TRAVIS_PULL_REQUEST'] = str(pr)
            envvars['TRAVIS_COMMIT_RANGE'] = make_commit_range(
                repo.heads.master.name,
                repo.commit('pull/{pr}'.format(pr=pr)).hexsha)

        with patch.dict(os.environ, envvars):
            cmd = 'ballet validate -A'
            check_call(cmd, cwd=safepath(base), env=os.environ)

    call_validate_all()

    # branch to a fake PR and write a new feature
    contrib_dir = base.joinpath(modname, 'features', 'contrib')
    logger.info('Switching to pull request 1, User Bob, Feature A')
    switch_to_new_branch(repo, 'pull/1')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # call different validation routines
    logger.info('Validating pull request 1, User Bob, Feature A')
    call_validate_all(pr=1)

    # merge PR with master
    logger.info('Merging into master')
    repo.git.checkout('master')
    repo.git.merge('pull/1', no_ff=True)

    # call different validation routines
    logger.info('Validating after merge')
    call_validate_all()

    # write another new feature
    logger.info('Switching to pull request 2, User Charlie, Feature Z_1')
    switch_to_new_branch(repo, 'pull/2')
    new_feature_str = make_feature_str('Z_1')
    username = '******'
    featurename = 'Z_1'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    # we expect this feature to fail validation
    with pytest.raises(CalledProcessError):
        logger.info('Validating pull request 2, User Charlie, Feature Z_1')
        call_validate_all(pr=2)

    # write another new feature - redundancy
    repo.git.checkout('master')
    switch_to_new_branch(repo, 'pull/3')
    new_feature_str = make_feature_str('A_0')
    username = '******'
    featurename = 'A_0'
    submit_feature(repo, contrib_dir, username, featurename, new_feature_str)

    with pytest.raises(CalledProcessError):
        call_validate_all(pr=3)
Example #18
def render_project_template(project_template_path: Optional[Pathy] = None,
                            create_github_repo: bool = False,
                            github_token: Optional[str] = None,
                            **cc_kwargs) -> str:
    """Generate a ballet project according to the project template

    If creation of the GitHub repo is requested and the process fails for any
    reason, the quickstart still completes successfully, and users are
    instructed to read the corresponding section of the Maintainer's Guide to
    continue manually.

    Args:
        project_template_path: path to specific project template
        create_github_repo: whether to act to create the desired repo on
            GitHub after rendering the project. The repo will be owned by
            either the user or an org that the user has relevant permissions
            for, depending on what is entered during the quickstart prompts.
            If True, then a valid github token must also be provided.
        github_token: valid github token with appropriate permissions
        **cc_kwargs: options for the cookiecutter template
    """
    if project_template_path is None:
        project_template_path = PROJECT_TEMPLATE_PATH

    project_path = cookiecutter(project_template_path, **cc_kwargs)

    if create_github_repo:
        if github_token is None:
            raise ValueError('Need to provide github token')
        g = Github(github_token)

        # need to get params from new project config
        project = Project.from_path(project_path)
        owner = project.config.get('github.github_owner')
        name = project.config.get('project.project_slug')

        # create repo on github
        try:
            github_repo = ballet.util.git.create_github_repo(g, owner, name)
            logger.info(f'Created repo on GitHub at {github_repo.html_url}')
        except GithubException:
            logger.exception('Failed to create GitHub repo for this project')
            logger.warning(
                'Failed to create GitHub repo for this project...\n'
                'did you specify the intended repo owner, and do you have'
                ' permissions to create a repo under that owner?\n'
                'Try manually creating the repo: https://ballet.github.io/ballet/maintainer_guide.html#manual-repository-creation'  # noqa E501
            )
            return project_path

        # now push to remote
        # we don't need to set up the remote, as it has already been setup in
        # post_gen_hook.py
        local_repo = project.repo
        remote_name = project.config.get('github.remote')
        branches = [DEFAULT_BRANCH]
        try:
            push_branches_to_remote(local_repo, remote_name, branches)
        except BalletError:
            logger.exception('Failed to push branches to GitHub repo')
            logger.warning(
                'Failed to push branches to GitHub repo...\n'
                'Try manually pushing the branches: https://ballet.github.io/ballet/maintainer_guide.html#manual-repository-creation'  # noqa E501
            )
            return project_path

    return project_path
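A hypothetical usage sketch: no_input and extra_context are standard cookiecutter options forwarded through **cc_kwargs, and the token value is a placeholder.

project_path = render_project_template(
    create_github_repo=True,
    github_token='<personal-access-token>',
    no_input=True,
    extra_context={'project_name': 'My Ballet Project'},
)
logger.info(f'Rendered project at {project_path}')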
Example #19
def echo():
    fn = pathlib.Path.cwd().resolve()
    logger.info(f'New project created in {fn!s}')
Example #20
def _log_start_new_feature_success(result: List[Tuple[pathlib.Path, str]]):
    logger.info('Start new feature successful')
    for (name, kind) in result:
        if kind == 'file' and '__init__' not in str(name):
            relname = pathlib.Path(name).relative_to(pathlib.Path.cwd())
            logger.info(f'Created {relname}')
Example #21
def _log_switch_to_new_branch(branch: Optional[str]):
    if branch is not None:
        logger.info(f'Switched to branch {branch}')
Example #22
def echo():
    fn = pathlib.Path.cwd().absolute()
    logger.info('New project created in {!s}'.format(fn))
Example #23
def dump_travis_env_vars():
    logger.info(repr(get_travis_env_vars()))
Example #24
    def judge(self):
        logger.info(f'Judging feature using {self}')
        return True
Example #25
    def judge(self):
        """Judge feature acceptance using GFSSF

        Uses lines 1-8 of agGFSSF where we do not remove accepted but
        redundant features on line 8.
        """
        if np.isnan(self.y_val).any():
            raise ValueError(
                f'{self.__class__.__name__} does not support missing targets,'
                ' please use a different evaluator.')

        logger.info(f'Judging feature using {self}')

        feature_df_map = self._get_feature_df_map()

        candidate_df = feature_df_map[self.candidate_feature]
        n_samples, n_candidate_cols = candidate_df.shape

        lmbda_1, lmbda_2 = _compute_lmbdas(self.lmbda_1, self.lmbda_2,
                                           feature_df_map)

        logger.debug(
            f'Recomputed lambda_1={lmbda_1:0.3e}, lambda_2={lmbda_2:0.3e}')

        info = []

        omit_in_test = [None, *self.features]
        n_omit = len(omit_in_test)
        for i, omitted_feature in enumerate(omit_in_test):

            z = _concat_datasets(
                feature_df_map,
                n_samples,
                omit=[self.candidate_feature, omitted_feature])

            # Calculate CMI of candidate feature
            cmi = estimate_conditional_information(candidate_df, self.y_val, z)

            if omitted_feature is not None:
                omit_df = feature_df_map[omitted_feature]
                _, n_omit_cols = omit_df.shape

                # Calculate CMI of omitted feature
                cmi_omit = estimate_conditional_information(
                    omit_df, self.y_val, z)
            else:
                cmi_omit = 0
                n_omit_cols = 0

                # want to log to INFO only the case of I(Z;Y|X) where X is the
                # entire feature matrix, i.e. no omitted features.
                logger.info(f'I(feature ; target | existing_features) = {cmi}')

            statistic = cmi - cmi_omit
            threshold = _compute_threshold(lmbda_1, lmbda_2, n_candidate_cols,
                                           n_omit_cols)
            delta = statistic - threshold

            if delta >= 0:
                omitted_source = getattr(omitted_feature, 'source', 'None')
                logger.debug(
                    f'Succeeded while omitting feature: {omitted_source}')

                return True
            else:
                iteration_info = GFSSFIterationInfo(
                    i=i,
                    n_samples=n_samples,
                    candidate_feature=self.candidate_feature,
                    candidate_cols=n_candidate_cols,
                    candidate_cmi=cmi,
                    omitted_feature=omitted_feature,
                    omitted_cols=n_omit_cols,
                    omitted_cmi=cmi_omit,
                    statistic=statistic,
                    threshold=threshold,
                    delta=delta,
                )
                info.append(iteration_info)
                logger.debug(
                    f'Completed iteration {i}/{n_omit}: {iteration_info}')

        info_closest = max(info, key=lambda x: x.delta)
        cmi_closest = info_closest.candidate_cmi
        omitted_cmi_closest = info_closest.omitted_cmi
        statistic_closest = info_closest.statistic
        threshold_closest = info_closest.threshold
        logger.info(
            f'Rejected feature: best marginal conditional mutual information was not greater than threshold ({cmi_closest:0.3e} - {omitted_cmi_closest:0.3e} = {statistic_closest:0.3e}, vs needed {threshold_closest:0.3e}).'
        )  # noqa

        return False
Example #26
    def judge(self):
        """Accept feature with probability p"""
        logger.info(f'Judging feature using {self}')
        with seeded(self.seed):
            return random.uniform(0, 1) < self.p
Example #27
def _log_collect_items(name, items):
    n = len(items)
    s = make_plural_suffix(items)
    logger.info('Collected {n} {name}{s}'.format(n=n, name=name, s=s))
    return items
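The make_plural_suffix helper used here (and in the _categorize_file_diffs examples above) is not shown on this page; a minimal sketch under the assumption that it simply pluralizes based on item count:

def make_plural_suffix(items, suffix='s'):
    # 'feature' -> 'features' whenever there is not exactly one item
    return suffix if len(items) != 1 else ''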
Example #28
    def prune(self):
        logger.info(f'Pruning features using {self}')
        return []
Example #29
    def prune(self):
        """With probability p, select a random feature to prune"""
        logger.info(f'Pruning features using {self}')
        with seeded(self.seed):
            if random.uniform(0, 1) < self.p:
                return [random.choice(self.features)]
        # otherwise prune nothing (return an empty list, not None)
        return []
Example #30
    def _log_failure_no_more_approaches(self):
        logger.info('Conversion failed, and we\'re not sure why...')