Example #1
def add_message(uuid):
    """
    This route is accessed remotely from the Chrome extension. It takes the user
    input (the Reddit submission's title and text) and, for each model, selects the
    top 3 predicted subreddits that are close enough to the decision boundary (or
    across it). The results are then sent back to the extension so it can display
    them.
    """
    content = request.json
    title = content["title"]
    text = content["text"]
    threshold = content["threshold"]
    max_predicted_classes = content["max_per_model"]
    X = title + " " + text
    X = vectorizer.transform([X])

    selected_predictions = []
    for i in clf:
        with sklearn.config_context(assume_finite=True):
            my_dec = i.decision_function(X)
            argsorted_dec = my_dec.argsort()[0][::-1]
            argsorted_dec_thresh = argsorted_dec[:max_predicted_classes][
                my_dec[0][argsorted_dec[:max_predicted_classes]] > threshold]
            sorted_classes = i.classes_[argsorted_dec_thresh]
        selected_predictions += list(sorted_classes)

    return jsonify(selected_predictions)
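
For readability, here is a minimal, self-contained sketch of the selection idiom used above (top-k classes by decision_function score, filtered by a threshold, inside an assume_finite context). The toy LinearSVC, the synthetic data, and the threshold value are illustrative assumptions standing in for the project's pre-loaded clf and vectorizer.

import sklearn
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

# Toy stand-ins for the project's pre-trained model and vectorized input.
X_train, y_train = make_classification(n_samples=200, n_features=20,
                                       n_informative=10, n_classes=5,
                                       random_state=0)
model = LinearSVC().fit(X_train, y_train)
X_new = X_train[:1]

threshold = -0.5           # how close to the decision boundary is "close enough"
max_predicted_classes = 3  # the "top 3" per model

with sklearn.config_context(assume_finite=True):   # skip NaN/inf input checks
    scores = model.decision_function(X_new)        # shape (1, n_classes)
    top = scores.argsort()[0][::-1][:max_predicted_classes]  # best classes first
    kept = top[scores[0][top] > threshold]         # drop low-scoring candidates
    predictions = list(model.classes_[kept])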
Example #2
File: test_knn.py  Project: Aathi410/Pro123
def test_knn_imputer_distance_weighted_not_enough_neighbors(
        na, working_memory):
    X = np.array([[3, na], [2, na], [na, 4], [5, 6], [6, 8], [na, 5]])

    dist = pairwise_distances(X,
                              metric="nan_euclidean",
                              squared=False,
                              missing_values=na)

    X_01 = np.average(X[3:5, 1], weights=1 / dist[0, 3:5])
    X_11 = np.average(X[3:5, 1], weights=1 / dist[1, 3:5])
    X_20 = np.average(X[3:5, 0], weights=1 / dist[2, 3:5])
    X_50 = np.average(X[3:5, 0], weights=1 / dist[5, 3:5])

    X_expected = np.array([[3, X_01], [2, X_11], [X_20, 4], [5, 6], [6, 8],
                           [X_50, 5]])

    with config_context(working_memory=working_memory):
        knn_3 = KNNImputer(missing_values=na,
                           n_neighbors=3,
                           weights="distance")
        assert_allclose(knn_3.fit_transform(X), X_expected)

        knn_4 = KNNImputer(missing_values=na,
                           n_neighbors=4,
                           weights="distance")
        assert_allclose(knn_4.fit_transform(X), X_expected)
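
For reference, a minimal usage sketch of the same combination outside a test (values are illustrative): KNNImputer fills each missing entry with a distance-weighted average of its nearest complete neighbours, and lowering working_memory only changes how the pairwise distances are chunked, not the imputed values.

import numpy as np
from sklearn import config_context
from sklearn.impute import KNNImputer

X = np.array([[1.0, 2.0, np.nan],
              [3.0, 4.0, 3.0],
              [np.nan, 6.0, 5.0],
              [8.0, 8.0, 7.0]])

# A tiny working_memory budget (in MiB) forces chunked distance computation,
# but the result is identical to the default setting.
with config_context(working_memory=1):
    imputer = KNNImputer(n_neighbors=2, weights="distance")
    X_filled = imputer.fit_transform(X)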
Example #3
def test_get_chunk_n_rows_warns():
    """Check that warning is raised when working_memory is too low."""
    row_bytes = 1024 * 1024 + 1
    max_n_rows = None
    working_memory = 1
    expected = 1

    warn_msg = (
        "Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
    )
    with pytest.warns(UserWarning, match=warn_msg):
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)

    with config_context(working_memory=working_memory):
        with pytest.warns(UserWarning, match=warn_msg):
            actual = get_chunk_n_rows(row_bytes=row_bytes,
                                      max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)
Example #4
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
                                                y_is_x):
    # check that pairwise_distances gives the same result in sequential and
    # parallel, when metric has data-derived parameters.
    with config_context(working_memory=1):  # to have more than 1 chunk
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            expected_dist_default_params = squareform(pdist(X, metric=metric))
            if metric == "seuclidean":
                params = {'V': np.var(X, axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(X.T)).T}
        else:
            Y = rng.random_sample((1000, 10))
            expected_dist_default_params = cdist(X, Y, metric=metric)
            if metric == "seuclidean":
                params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)
        dist = np.vstack(tuple(dist_function(X, Y,
                                             metric=metric, n_jobs=n_jobs)))

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
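
A condensed sketch of what this test checks (array sizes here are arbitrary assumptions): with working_memory=1 the distances are computed in several chunks, yet the data-derived V parameter of the 'seuclidean' metric is still estimated from the full input, so the chunked result matches an explicit-parameter cdist call.

import numpy as np
from scipy.spatial.distance import cdist
from sklearn import config_context
from sklearn.metrics import pairwise_distances_chunked

rng = np.random.RandomState(0)
X = rng.random_sample((500, 10))
Y = rng.random_sample((500, 10))

with config_context(working_memory=1):   # small budget -> more than one chunk
    D = np.vstack(list(pairwise_distances_chunked(X, Y, metric="seuclidean",
                                                  n_jobs=2)))

V = np.var(np.vstack([X, Y]), axis=0, ddof=1)   # the data-derived parameter
np.testing.assert_allclose(D, cdist(X, Y, metric="seuclidean", V=V))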
Example #5
def run_bench(repeat=10, verbose=False):

    pbefore = dict(n_neighbors=[2, 5],
                   leaf_size=[10],
                   dim=[2, 5],
                   onnx_options=[
                       None, {
                           KNeighborsClassifier: {
                               'optim': 'cdist',
                               'zipmap': False
                           }
                       }
                   ],
                   metric=["euclidean"])
    pafter = dict(N=[1, 10, 100])

    test = lambda dim=None, **opts: OnnxRuntimeBenchPerfTestBinaryClassification(
        KNeighborsClassifier, dim=dim, **opts)
    bp = BenchPerf(pbefore, pafter, test)

    with sklearn.config_context(assume_finite=True):
        start = time()
        results = list(
            bp.enumerate_run_benchs(repeat=repeat,
                                    verbose=verbose,
                                    stop_if_error=False))
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))
    return results_df
Example #6
def run_bench(repeat=10, verbose=False):

    pbefore = dict(dim=[1, 5, 10, 100],
                   max_depth=[2, 10],
                   n_estimators=[1, 10, 100, 1000, 10000],
                   onnx_options=[{
                       RandomForestClassifier: {
                           'zipmap': False
                       }
                   }])
    pafter = dict(N=[1, 10, 100])

    test = lambda dim=None, **opts: OnnxRuntimeBenchPerfTestBinaryClassification(
        RandomForestClassifier, dim=dim, **opts)
    bp = BenchPerf(pbefore, pafter, test)

    with sklearn.config_context(assume_finite=True):
        start = time()
        results = list(
            bp.enumerate_run_benchs(repeat=repeat,
                                    verbose=verbose,
                                    stop_if_error=False))
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))
    return results_df
Example #7
def already_posted(uuid):
    """
    This route is accessed from submission pages that already exist. It returns the
    top 3 predictions for each model if they meet the specified threshold, and it
    will not suggest the subreddit the post is already in. It uses the Reddit API to
    fetch the submission's actual title and text.
    """
    content = request.json
    submission = praw.models.Submission(reddit, url=content["url"])
    threshold = content["threshold"]
    max_predicted_classes = content["max_per_model"]

    title = submission.title
    text = submission.selftext
    subreddit = submission.subreddit
    X = title + " " + text
    X = vectorizer.transform([X])

    selected_predictions = []
    for i in clf:
        with sklearn.config_context(assume_finite=True):
            my_dec = i.decision_function(X)
            argsorted_dec = my_dec.argsort()[0][::-1]
            argsorted_dec_thresh = argsorted_dec[:max_predicted_classes][
                my_dec[0][argsorted_dec[:max_predicted_classes]] > threshold]
            sorted_classes = i.classes_[argsorted_dec_thresh]
        selected_predictions += list(sorted_classes)

    # Remove prediction if it is the same subreddit you are in
    if subreddit in selected_predictions:
        selected_predictions.remove(subreddit)

    return jsonify(selected_predictions)
Example #8
def test_one_estimator_print_change_only(print_changed_only):
    pca = PCA(n_components=10)

    with config_context(print_changed_only=print_changed_only):
        pca_repr = str(pca)
        html_output = estimator_html_repr(pca)
        assert pca_repr in html_output
Example #9
def test_kwargs_in_init():
    # Make sure the changed_only=True mode is OK when an argument is passed as
    # kwargs.
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17206

    class WithKWargs(BaseEstimator):
        # Estimator with a kwargs argument. These need to hack around
        # set_params and get_params. Here we mimic what LightGBM does.
        def __init__(self, a='willchange', b='unchanged', **kwargs):
            self.a = a
            self.b = b
            self._other_params = {}
            self.set_params(**kwargs)

        def get_params(self, deep=True):
            params = super().get_params(deep=deep)
            params.update(self._other_params)
            return params

        def set_params(self, **params):
            for key, value in params.items():
                setattr(self, key, value)
                self._other_params[key] = value
            return self

    est = WithKWargs(a='something', c='abcd', d=None)

    expected = "WithKWargs(a='something', c='abcd', d=None)"
    assert expected == est.__repr__()

    with config_context(print_changed_only=False):
        expected = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
        assert expected == est.__repr__()
Example #10
def run_bench(repeat=10, verbose=False):

    pbefore = dict(
        dim=[1, 5, 10, 20],
        alpha=[0.1, 1., 10.],
        onnx_options=[None, {
            GaussianProcessRegressor: {
                'optim': 'cdist'
            }
        }],
        dtype=[numpy.float32, numpy.float64])
    pafter = dict(N=[1, 10, 100, 1000])

    test = lambda dim=None, **opts: OnnxRuntimeBenchPerfTestRegression(
        GaussianProcessRegressor, dim=dim, N_fit=100, **opts)
    bp = BenchPerf(pbefore, pafter, test)

    with sklearn.config_context(assume_finite=True):
        start = time()
        results = list(
            bp.enumerate_run_benchs(repeat=repeat,
                                    verbose=verbose,
                                    stop_if_error=False))
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))
    return results_df
Example #11
def main(m="LogisticRegression",
         e=100,
         n=10000,
         f=10,
         r=1000,
         a=True,
         o=True,
         j=2,
         opts=""):
    """
    Builds a model and benchmarks the model converted into ONNX.

    :param m: model name or experiment
    :param e: number of estimators or trees
    :param n: number of rows
    :param f: number of features
    :param r: number of repetitions
    :param a: assume finite or not
    :param o: compares to ONNX
    :param j: n_jobs
    :param opts: options
    """
    model_data = build_model(m, e, n, f, o, j, opts)

    if a:
        with config_context(assume_finite=True):
            benchmark(model_data['model'], model_data.get('onnx', None),
                      model_data['data'], r)
    else:
        benchmark(model_data['model'], model_data.get('onnx', None),
                  model_data['data'], r)
Example #12
File: test_knn.py  Project: Aathi410/Pro123
def test_knn_imputer_with_simple_example(na, working_memory):

    X = np.array([
        [0, na, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7],
    ])

    r0c1 = np.mean(X[1:6, 1])
    r0c3 = np.mean(X[2:-1, -1])
    r1c3 = np.mean(X[2:-1, -1])
    r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2])
    r7c0 = np.mean(X[2:-1, 0])

    X_imputed = np.array([
        [0, r0c1, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7],
    ])

    with config_context(working_memory=working_memory):
        imputer_comp = KNNImputer(missing_values=na)
        assert_allclose(imputer_comp.fit_transform(X), X_imputed)
Example #13
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
                                                y_is_x):
    # check that pairwise_distances gives the same result in sequential and
    # parallel, when metric has data-derived parameters.
    with config_context(working_memory=1):  # to have more than 1 chunk
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            expected_dist_default_params = squareform(pdist(X, metric=metric))
            if metric == "seuclidean":
                params = {'V': np.var(X, axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(X.T)).T}
        else:
            Y = rng.random_sample((1000, 10))
            expected_dist_default_params = cdist(X, Y, metric=metric)
            if metric == "seuclidean":
                params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)
        dist = np.vstack(
            tuple(dist_function(X, Y, metric=metric, n_jobs=n_jobs)))

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
Example #14
def run_bench(repeat=5, verbose=False):

    pbefore = dict(dim=[-1],
                   model=list(
                       sorted([
                           'XGB', 'LGB', 'SVR', 'NuSVR', 'RF', 'DT', 'ADA',
                           'MLP', 'LR', 'GBT', 'KNN', 'KNN-cdist', 'HGB'
                       ])),
                   norm=[False, True],
                   dataset=["boston", "diabetes", "rndbin100"])
    pafter = dict(N=[
        1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000,
        50000
    ])

    test = lambda dim=None, **opts: DatasetsOrtBenchPerfTest(**opts)
    bp = BenchPerf(pbefore, pafter, test)

    with sklearn.config_context(assume_finite=True):
        start = time()
        results = list(
            bp.enumerate_run_benchs(repeat=repeat,
                                    verbose=verbose,
                                    stop_if_error=False))
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))
    return results_df
Example #15
def benchmark(X, fct1, fct2, N, repeat=10, number=20):

    def ti(r, n):
        if n <= 1:
            return 40 * r
        if n <= 10:
            return 10 * r
        if n <= 100:
            return 4 * r
        if n <= 1000:
            return r
        return r // 2

    with sklearn.config_context(assume_finite=True):
        # to warm up the engine
        time_kwargs = {n: dict(repeat=10, number=10) for n in N}
        benchmark_fct(fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
        benchmark_fct(fct2, X, time_kwargs=time_kwargs, skip_long_test=False)
        # real measure
        time_kwargs = {n: dict(repeat=ti(repeat, n), number=number) for n in N}
        res1 = benchmark_fct(
            fct1, X, time_kwargs=time_kwargs, skip_long_test=False)
        res2 = benchmark_fct(
            fct2, X, time_kwargs=time_kwargs, skip_long_test=False)
    res = {}
    for r in sorted(res1):
        r1 = res1[r]
        r2 = res2[r]
        ratio = r2['ttime'] / r1['ttime']
        res[r] = ratio
    return res
Example #16
File: models.py  Project: thecobb/dabl
    def fit(self, X, y=None, *, target_col=None):
        """Fit estimator.

        Requires the target to be specified either as a separate 1d array or
        Series y (in scikit-learn fashion) or as a column of the dataframe X
        named by target_col.
        If y is specified, X is assumed not to contain the target.

        Parameters
        ----------
        X : DataFrame
            Input features. If target_col is specified, X also includes the
            target.
        y : Series or numpy array, optional.
            Target. You need to specify either y or target_col.
        target_col : string or int, optional
            Column name of target if included in X.
        """
        # copy and paste from above?!
        if ((y is None and target_col is None)
                or (y is not None) and (target_col is not None)):
            raise ValueError(
                "Need to specify either y or target_col.")
        X, y = _validate_Xyt(X, y, target_col, do_clean=False)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        types = detect_types(X, type_hints=self.type_hints)
        self.feature_names_ = X.columns
        self.types_ = types
        cv = 5
        ratio = 3

        y, self.scoring_ = self._preprocess_target(y)
        self.log_ = []

        # reimplement cross-validation so we only do preprocessing once
        pipe = Pipeline([('preprocessing',
                          EasyPreprocessor(verbose=self.verbose, types=types)),
                         ('classifier', DummyClassifier())])

        estimators = self._get_estimators()
        param_grid = [{'classifier': [est]} for est in estimators]
        gs = GridSuccessiveHalving(
            ratio=ratio,
            estimator=pipe, param_grid=param_grid,
            force_exhaust_budget=self.force_exhaust_budget,
            verbose=self.verbose, cv=cv, error_score='raise',
            scoring=self.scoring_, refit='recall_macro', n_jobs=self.n_jobs)
        self.search_ = gs
        with sklearn.config_context(print_changed_only=True):
            gs.fit(X, y)
        self.est_ = gs.best_estimator_

        print("best classifier: ", gs.best_params_['classifier'])
        print("best score: {:.3f}".format(gs.best_score_))

        return self
Example #17
def test_config_context_exception():
    assert get_config()['assume_finite'] is False
    try:
        with config_context(assume_finite=True):
            assert get_config()['assume_finite'] is True
            raise ValueError()
    except ValueError:
        pass
    assert get_config()['assume_finite'] is False
Example #18
def test_config_context_exception():
    assert get_config()['assume_finite'] is False
    try:
        with config_context(assume_finite=True):
            assert get_config()['assume_finite'] is True
            raise ValueError()
    except ValueError:
        pass
    assert get_config()['assume_finite'] is False
Example #19
def test_config_context_exception():
    assert_equal(get_config(), {'assume_finite': False})
    try:
        with config_context(assume_finite=True):
            assert_equal(get_config(), {'assume_finite': True})
            raise ValueError()
    except ValueError:
        pass
    assert_equal(get_config(), {'assume_finite': False})
Example #20
def test_config_context():
    assert get_config() == {
        'assume_finite': False,
        'working_memory': 1024,
        'print_changed_only': True,
        'display': 'text'
    }

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        assert get_config() == {
            'assume_finite': True,
            'working_memory': 1024,
            'print_changed_only': True,
            'display': 'text'
        }
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert get_config()['assume_finite'] is True

        assert get_config()['assume_finite'] is True

        with config_context(assume_finite=False):
            assert get_config()['assume_finite'] is False

            with config_context(assume_finite=None):
                assert get_config()['assume_finite'] is False

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert get_config()['assume_finite'] is True

            assert get_config()['assume_finite'] is False

        assert get_config()['assume_finite'] is True

    assert get_config() == {
        'assume_finite': False,
        'working_memory': 1024,
        'print_changed_only': True,
        'display': 'text'
    }

    # No positional arguments
    assert_raises(TypeError, config_context, True)
    # No unknown arguments
    assert_raises(TypeError, config_context(do_something_else=True).__enter__)
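
The behaviour this test pins down can be summarised in a few lines; the sketch below assumes a fresh session where working_memory is still at its 1024 MiB default.

from sklearn import config_context, get_config, set_config

set_config(assume_finite=True)
with config_context(assume_finite=False):
    assert get_config()['assume_finite'] is False
    set_config(working_memory=256)       # takes effect immediately inside...
    assert get_config()['working_memory'] == 256
# ...but on exit config_context restores every setting captured at entry,
# including values changed via set_config within the block.
assert get_config()['assume_finite'] is True
assert get_config()['working_memory'] == 1024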
Example #21
def test_repr_html_wraps():
    # Checks the display configuration flag controls the html output
    tree = DecisionTreeClassifier()
    msg = "_repr_html_ is only defined when"
    with pytest.raises(AttributeError, match=msg):
        output = tree._repr_html_()

    with config_context(display='diagram'):
        output = tree._repr_html_()
        assert "<style>" in output
Example #22
def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch):
    pytest.importorskip('pandas')

    data_id = 1119
    _monkey_patch_webbased_functions(monkeypatch, data_id, True)

    msg = 'Could not adhere to working_memory config.'
    with pytest.warns(UserWarning, match=msg):
        with config_context(working_memory=1e-6):
            fetch_openml(data_id=data_id, as_frame=True, cache=False)
Example #23
def test_repr_mimebundle_():
    # Checks the display configuration flag controls the json output
    tree = DecisionTreeClassifier()
    output = tree._repr_mimebundle_()
    assert "text/plain" in output
    assert "text/html" not in output

    with config_context(display='diagram'):
        output = tree._repr_mimebundle_()
        assert "text/plain" in output
        assert "text/html" in output
Example #24
def test_birch_duck_typing_meta():
    # Test duck typing meta estimators with Birch
    birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3))
    html_output = estimator_html_repr(birch)

    # inner estimators do not show changes
    with config_context(print_changed_only=True):
        assert f"<pre>{str(birch.n_clusters)}" in html_output
        assert "AgglomerativeClustering</label>" in html_output

    # outer estimator contains all changes
    assert f"<pre>{str(birch)}" in html_output
Example #25
def test_ovo_classifier_duck_typing_meta():
    # Test duck typing metaestimators with OVO
    ovo = OneVsOneClassifier(LinearSVC(penalty="l1"))
    html_output = estimator_html_repr(ovo)

    # inner estimators do not show changes
    with config_context(print_changed_only=True):
        assert f"<pre>{str(ovo.estimator)}" in html_output
        assert "LinearSVC</label>" in html_output

    # outer estimator
    assert f"<pre>{str(ovo)}" in html_output
Example #26
def run_bench(repeat=100, verbose=False):
    pbefore = dict(dim=[5, 10, 50])
    pafter = dict(N=[10, 100, 1000])
    bp = BenchPerf(pbefore, pafter, PolyBenchPerfTest)

    with sklearn.config_context(assume_finite=True):
        start = time()
        results = list(bp.enumerate_run_benchs(repeat=repeat, verbose=verbose))
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))
    return results_df
Example #27
    def preview(self, *, display: str = "text") -> str:
        """
        Create a text representation of the model.

        :param display: If 'diagram', estimators will be displayed as a diagram in an
            HTML format when shown in a Jupyter notebook. If 'text', estimators will be
            displayed as text.
        :return: A string representation of the model's internal configuration.
        """
        if self.model:
            with config_context(display=display):
                return str(self.model)
        else:
            return ""
Example #28
def run_bench(repeat=100, verbose=False):
    n_obs = [10, 100, 1000]
    n_features = [5, 10, 50]

    with sklearn.config_context(assume_finite=True):
        start = time()
        results = bench(n_obs, n_features, repeat=repeat, verbose=verbose)
        end = time()

    results_df = pandas.DataFrame(results)
    print("Total time = %0.3f sec\n" % (end - start))

    # plot the results
    plot_results(results_df, verbose=verbose)
    return results_df
Example #29
def _run_skl_prediction(obs, check_runtime, assume_finite, inst, method_name,
                        predict_kwargs, X_test, benchmark, debug, verbose,
                        time_kwargs, skip_long_test, time_kwargs_fact, fLOG):
    if not check_runtime:
        return None  # pragma: no cover
    if verbose >= 2 and fLOG is not None:
        fLOG("[enumerate_compatible_opset] check_runtime SKL {}-{}-{}-{}-{}".
             format(id(inst), method_name, predict_kwargs, time_kwargs,
                    time_kwargs_fact))
    with sklearn.config_context(assume_finite=assume_finite):
        # compute sklearn prediction
        obs['ort_version'] = ort_version
        try:
            meth = getattr(inst, method_name)
        except AttributeError as e:  # pragma: no cover
            if debug:
                raise  # pragma: no cover
            obs['_2skl_meth_exc'] = str(e)
            return e
        try:
            ypred, t4, ___ = _measure_time(
                lambda: meth(X_test, **predict_kwargs))
            obs['lambda-skl'] = (lambda xo: meth(xo, **predict_kwargs), X_test)
        except (
                ValueError,
                AttributeError,  # pragma: no cover
                TypeError,
                MemoryError,
                IndexError) as e:
            if debug:
                raise  # pragma: no cover
            obs['_3prediction_exc'] = str(e)
            return e
        obs['prediction_time'] = t4
        obs['assume_finite'] = assume_finite
        if benchmark and 'lambda-skl' in obs:
            obs['bench-skl'] = benchmark_fct(*obs['lambda-skl'],
                                             obs=obs,
                                             time_kwargs=_multiply_time_kwargs(
                                                 time_kwargs, time_kwargs_fact,
                                                 inst),
                                             skip_long_test=skip_long_test)
        if verbose >= 3 and fLOG is not None:
            fLOG("[enumerate_compatible_opset] scikit-learn prediction")
            _dispsimple(ypred, fLOG)
        if verbose >= 2 and fLOG is not None:
            fLOG("[enumerate_compatible_opset] predictions stored")
    return ypred
Example #30
def get_distance(x1, x2, triplet_similarity, mode='numpy'):
    n_jobs = 8 if x1.shape[0] > 1 else 1
    is_item = False
    if isinstance(x1, np.ndarray):
        if len(x1.shape) == 1:
            is_item = True
            x1 = np.expand_dims(x1, 0)
            x2 = np.expand_dims(x2, 0)
    else:
        if len(x1.size()) == 1:
            x1 = x1.unsqueeze(0)
            x2 = x2.unsqueeze(0)
            is_item = True

    if mode != 'numpy':
        if isinstance(x1, np.ndarray):
            x1 = torch.FloatTensor(x1).to(mode)
            x2 = torch.FloatTensor(x2).to(mode)

    # latest scikit-learn needed: conda install -c anaconda scikit-learn
    if isinstance(x1, np.ndarray):
        with sklearn.config_context(working_memory=1024):
            if triplet_similarity == 'cos':
                dist = np.zeros((0, ))
                for each in sklearn.metrics.pairwise.pairwise_distances_chunked(
                        x1, x2, metric="cosine", n_jobs=n_jobs):
                    dist = np.concatenate((dist, np.diag(each)), axis=0)
            else:
                dist = np.zeros((0, ))
                for each in sklearn.metrics.pairwise.pairwise_distances_chunked(
                        x1, x2, metric="euclidean", n_jobs=n_jobs):
                    dist = np.concatenate((dist, np.diag(each)), axis=0)
    else:
        if triplet_similarity == 'cos':
            dist = 1. - F.cosine_similarity(x1, x2, dim=1,
                                            eps=1e-20)  # -1 .. 1 => 0 .. 2
        else:
            dist = F.pairwise_distance(x1, x2, eps=1e-20)  # 0 .. 2

    if mode != 'numpy':
        if isinstance(x1, np.ndarray):
            dist = dist.to('cpu').numpy()

    if is_item:
        dist = dist[0]

    return dist
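
Side note (a sketch under the assumption that only the per-row, paired distances are needed, as in the loop above): scikit-learn also exposes paired distance helpers that return one value per row pair directly, without building full chunked distance matrices.

import numpy as np
from sklearn.metrics.pairwise import (paired_cosine_distances,
                                      paired_euclidean_distances)

x1 = np.random.rand(64, 128)
x2 = np.random.rand(64, 128)

cos_dist = paired_cosine_distances(x1, x2)   # shape (64,), one value per row pair
euc_dist = paired_euclidean_distances(x1, x2)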
Example #31
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)
    with config_context(working_memory=working_memory):
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)
Example #32
def test_pairwise_distances_argmin_min(X_blobs):
    centers = X_blobs[::100].compute()

    # X_blobs has 500 rows per block.
    # Ensure 500 rows in the scikit-learn version too.
    working_memory = float(80 * 500) / 2**20

    ctx = sklearn.config_context(working_memory=working_memory)

    with ctx:
        a_, b_ = sklearn.metrics.pairwise_distances_argmin_min(
            X_blobs.compute(), centers)
        a, b = dask_ml.metrics.pairwise_distances_argmin_min(X_blobs, centers)
        a, b = dask.compute(a, b)

    npt.assert_array_equal(a, a_)
    npt.assert_array_equal(b, b_)
Example #33
def test_config_context():
    assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                            'print_changed_only': False}

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        assert get_config() == {'assume_finite': True, 'working_memory': 1024,
                                'print_changed_only': False}
    assert get_config()['assume_finite'] is False

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert get_config()['assume_finite'] is True

        assert get_config()['assume_finite'] is True

        with config_context(assume_finite=False):
            assert get_config()['assume_finite'] is False

            with config_context(assume_finite=None):
                assert get_config()['assume_finite'] is False

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert get_config()['assume_finite'] is True

            assert get_config()['assume_finite'] is False

        assert get_config()['assume_finite'] is True

    assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                            'print_changed_only': False}

    # No positional arguments
    assert_raises(TypeError, config_context, True)
    # No unknown arguments
    assert_raises(TypeError, config_context(do_something_else=True).__enter__)
Example #34
def test_config_context():
    assert_equal(get_config(), {'assume_finite': False})

    # Not using as a context manager affects nothing
    config_context(assume_finite=True)
    assert_equal(get_config(), {'assume_finite': False})

    with config_context(assume_finite=True):
        assert_equal(get_config(), {'assume_finite': True})
    assert_equal(get_config(), {'assume_finite': False})

    with config_context(assume_finite=True):
        with config_context(assume_finite=None):
            assert_equal(get_config(), {'assume_finite': True})

        assert_equal(get_config(), {'assume_finite': True})

        with config_context(assume_finite=False):
            assert_equal(get_config(), {'assume_finite': False})

            with config_context(assume_finite=None):
                assert_equal(get_config(), {'assume_finite': False})

                # global setting will not be retained outside of context that
                # did not modify this setting
                set_config(assume_finite=True)
                assert_equal(get_config(), {'assume_finite': True})

            assert_equal(get_config(), {'assume_finite': False})

        assert_equal(get_config(), {'assume_finite': True})

    assert_equal(get_config(), {'assume_finite': False})

    # No positional arguments
    assert_raises(TypeError, config_context, True)
    # No unknown arguments
    assert_raises(TypeError, config_context(do_something_else=True).__enter__)
Example #35
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory,
                          expected, warning):
    if warning is not None:
        def check_warning(*args, **kw):
            return assert_warns_message(UserWarning, warning, *args, **kw)
    else:
        check_warning = assert_no_warnings

    actual = check_warning(get_chunk_n_rows,
                           row_bytes=row_bytes,
                           max_n_rows=max_n_rows,
                           working_memory=working_memory)

    assert actual == expected
    assert type(actual) is type(expected)
    with config_context(working_memory=working_memory):
        actual = check_warning(get_chunk_n_rows,
                               row_bytes=row_bytes,
                               max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)