Example #1
def compute():
    """Create PFA for kNN."""
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    params = parameters.fetch_parameters()

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    logging.info('Creating new estimator')
    estimator = _create_estimator(job_type, params)
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X)
    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    # Fit the estimator on the featurized data (nulls were already removed above)
    estimator.fit(X, y)

    # Create PFA from the estimator
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
    pfa['name'] = "kNN"

    # Save or update job_result
    logging.info('Saving PFA to job_results table...')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
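
Example #1 above, like most snippets below, relies on a `_create_featurizer` helper whose body is not shown. A minimal sketch of the interface these call sites appear to assume (a `transform` method, a `columns` attribute, and a `generate_pretty_pfa` method); the internals here are entirely hypothetical:

import numpy as np


class DummyFeaturizer:
    """Hypothetical featurizer; a real one would e.g. one-hot encode nominal variables."""

    def __init__(self, indep_vars):
        # assumption: one numeric output column per independent variable
        self.columns = [var['name'] for var in indep_vars]

    def transform(self, X):
        # dataframe -> plain numeric matrix, column order fixed by self.columns
        return np.asarray(X[self.columns], dtype=float)

    def generate_pretty_pfa(self):
        # PrettyPFA expression building the model's input vector
        return 'new(array(double), {})'.format(
            ', '.join('input.' + c for c in self.columns))


def _create_featurizer(indep_vars, estimator=None):
    return DummyFeaturizer(indep_vars)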
Example #2
def compute():
    """Create PFA for kNN."""
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")

    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # featurization
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    X = featurizer.transform(X)

    estimator = KMeans(n_clusters=k)
    estimator.fit(X)

    # Generate PFA for kmeans
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add centroids as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(estimator.cluster_centers_.tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)
    logging.info("DONE")
Example #3
def test_estimator_to_pfa_mlp_classifier():
    """Check that converted PFA is giving the same results as MLPClassifier"""
    X, y, types = _classification_task()
    estimator = _mlp_classifier(X, y, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
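
Every test in this listing compares against a `_predict_pfa` helper whose body is not shown. A plausible sketch using the Titus PFA engine (`titus2` on Python 3); the exact signature and behaviour are inferred from the call sites:

import numpy as np
from titus.genpy import PFAEngine


def _predict_pfa(X, types, pfa):
    # compile the PFA document into a scoring engine
    engine, = PFAEngine.fromJson(pfa)
    names = [name for name, _ in types]
    # score row by row; each datum is a record keyed by variable name
    return np.array([engine.action(dict(zip(names, row))) for row in X])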
Example #4
def test_estimator_to_pfa_mlp_regressor():
    """Check that converted PFA is giving the same results as MLPRegressor"""
    X, y, types = _regression_task()
    estimator = _mlp_regressor(X, y)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert _arrays_equal(estimator_pred, pfa_pred)
Example #5
def _generate_pfa_regressor(result, indep_vars, featurizer):
    # Create mock SGDRegressor for sklearn_to_pfa
    estimator = SGDRegressor()
    estimator.intercept_ = [result['intercept']]
    # NOTE: linearly dependent columns will be assigned 0
    estimator.coef_ = [
        result.get(c, {'coef': 0.})['coef'] for c in featurizer.columns
        if c != 'intercept'
    ]

    types = [(var['name'], var['type']['name']) for var in indep_vars]
    return sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
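
To illustrate the `result` layout the mock above expects: the intercept maps to a plain number, while every other entry maps to a dict with a 'coef' key, and columns missing from `result` fall back to a zero coefficient. A hypothetical call:

result = {
    'intercept': 0.7,            # used directly as the intercept
    'age': {'coef': 1.2},
    'bmi': {'coef': -0.3},       # any column absent here gets coef 0.
}
pfa = _generate_pfa_regressor(result, indep_vars, featurizer)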
Example #6
def test_estimator_to_pfa_kneighborsclassifier():
    """Check that converted PFA is giving the same results as KNeighborsClassifier"""
    X, y, types = _classification_task()

    estimator = _kneighborsclassifier(X, y, n_neighbors=2)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
Example #7
def test_estimator_to_pfa_kmeans():
    """Check that converted PFA is giving the same results as KMeans"""
    X, _, types = _classification_task()

    estimator = _kmeans(X, n_clusters=2)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
Example #8
def test_estimator_to_pfa_mixednb(dtypes):
    """Check that converted PFA is giving the same results as MixedNB"""
    X, y, types = _classification_task(dtypes=dtypes)

    is_nominal = [t == 'n' for t in dtypes]
    estimator = _mixednb(X, y, is_nominal=is_nominal, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
Example #9
def test_estimator_to_pfa_gradientboostingclassifier():
    """Check that converted PFA is giving the same results as GradientBoostingClassifier"""
    X, y, types = _classification_task()

    estimator = _gradientboostingclassifier(X,
                                            y,
                                            n_estimators=10,
                                            learning_rate=0.1)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
Example #10
def test_estimator_to_pfa_gradientboostingregressor():
    """Check that converted PFA is giving the same results as GradientBoostingRegressor"""
    X, y, types = _regression_task()

    estimator = _gradientboostingregressor(X,
                                           y,
                                           n_estimators=10,
                                           learning_rate=0.1)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    np.testing.assert_almost_equal(estimator_pred, pfa_pred, decimal=5)
Example #11
def test_estimator_to_pfa_mixednb_zero_prior():
    """Check that converted PFA is giving the same results as MultinomialNB with category that has no values."""
    dtypes = 'ccn'
    X, y, types = _classification_task(n_features=3, dtypes=dtypes)
    y[:] = 'a'

    is_nominal = [t == 'n' for t in dtypes]
    estimator = _mixednb(X, y, is_nominal=is_nominal, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
Example #12
def test_estimator_to_pfa_multinomialnb():
    """Check that converted PFA is giving the same results as MultinomialNB"""
    X, y, types = _classification_task()

    # artificially create 0/1 inputs from X because `MultinomialNB` works only with counts
    X = (X > 0).astype(int)

    estimator = _multinomialnb(X, y, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
Example #13
def test_estimator_to_pfa_gradientboostingclassifier_nosplits():
    X, y, types = _classification_task()

    # a huge `min_samples_split` guarantees there will be no splits
    estimator = _gradientboostingclassifier(X,
                                            y,
                                            min_samples_split=1000000,
                                            n_estimators=10,
                                            learning_rate=0.1)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
Example #14
def aggregate_kmeans(job_ids):
    """Compute merging of clusters according to least merging error (e.g. smallest distance betweeen centroids)
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    data = [
        json.loads(io_helper.get_results(str(job_id)).data)
        for job_id in job_ids
    ]

    local_centroids = [
        np.array(x['centroids']) for x in data if x['centroids']
    ]
    indep_vars = data[0]['indep_vars']

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    logging.info("Centroids:\n{}".format(remote_centroids))

    # Create fake KMeans estimator and assign it our centroids
    estimator = KMeans()
    estimator.cluster_centers_ = np.array(remote_centroids)

    # Generate PFA for kmeans and add centroids to metadata
    featurizer = _create_featurizer(indep_vars)
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add serialized model as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(np.array(remote_centroids).tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
    logging.info("DONE")
Example #15
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))

        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])

    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have a single category ({}); gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)
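
Example #15 guards PFA generation with an `_is_fitted` helper that is not shown. With scikit-learn >= 0.22 it can be a thin wrapper over `check_is_fitted`; a minimal sketch under that assumption:

from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted


def _is_fitted(estimator):
    # check_is_fitted raises NotFittedError when no fitted attributes exist
    try:
        check_is_fitted(estimator)
        return True
    except NotFittedError:
        return False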