def aggregate_stats(job_ids, graph_type=None):
    """Get all partial statistics from all nodes and aggregate them.

    :param job_ids: list of job ids with intermediate results
    :param graph_type: 'correlation_heatmap' or 'pca'; falls back to the
        `graph` model parameter when not given
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = io_helper.load_intermediate_json_results(map(str, job_ids))

    corr, columns, crosstab = _aggregate_results(results)

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')

    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        # save PCA graphs, but leave out the one with PCA scores
        logging.warning(
            'Sample scores graph is not yet implemented for distributed PCA.')
        fig = _fig_pca(corr, columns, X=None)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`'
        )

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
def compute(graph_type=None):
    """Perform both intermediate step and aggregation at once."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    result = _compute_intermediate_result(inputs)
    corr, columns, crosstab = _aggregate_results([result])

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')

    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        X = io_helper.fetch_dataframe([dep_var] + indep_vars)
        fig = _fig_pca(corr, columns, X)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`'
        )

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
Example #3
def main(clean_files=False):
    """
    :param clean_files: if True, clean files afterwards
    """
    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]

    beam = parameters.get_parameter('beam', int, 10)
    support = parameters.get_parameter('support', float, 0.00001)
    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file

    # Hedwig needs a background-knowledge directory; an empty temp dir suffices
    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__',
        empty_bk, examples_file,
        '--beam', str(beam),
        '--support', str(support),
        '-f', 'csv', '-l',
        '-o', rules_out_file,
        '--nocache',
    ])

    with open(rules_out_file) as f:
        results = f.read()

    if clean_files:
        os.remove(out_file)
        os.remove(rules_out_file)

    io_helper.save_results(results.replace('less_than', '<'),
                           shapes.Shapes.TEXT)
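
For reference, the subprocess call above amounts to the following shell invocation with the defaults used in `main` (the background-knowledge directory is an empty temp dir whose exact path varies):

python -m hedwig.__main__ <empty_bk_dir> input.csv --beam 10 --support 1e-05 -f csv -l -o rules.txt --nocache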
Example #4
def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    design = get_parameter(DESIGN_PARAM, str, DEFAULT_DESIGN)

    # Check dependent variable type (should be continuous)
    if dep_var["type"]["name"] not in ["integer", "real"]:
        raise errors.UserError('Dependent variable should be continuous!')

    # Extract data and parameters from inputs
    data = format_data(inputs["data"])

    # Compute anova and generate PFA output
    anova_results = format_output(
        compute_anova(dep_var, indep_vars, data, design).to_dict())

    # Store results
    io_helper.save_results(anova_results, Shapes.JSON)
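
`compute_anova` is defined elsewhere in the module; below is a minimal sketch of such a function, assuming a statsmodels backend and that `design` distinguishes a 'factorial' design (with interaction terms) from an additive one. All names here are illustrative, not the project's API.

import statsmodels.api as sm
from statsmodels.formula.api import ols

def compute_anova_sketch(dep_var, indep_vars, data, design):
    # Build an R-style formula; '*' adds interaction terms for a factorial design
    op = ' * ' if design == 'factorial' else ' + '
    formula = '{} ~ {}'.format(dep_var['name'], op.join(v['name'] for v in indep_vars))
    model = ols(formula, data=data).fit()
    # anova_lm returns a DataFrame, matching the .to_dict() call above
    return sm.stats.anova_lm(model, typ=1)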
Example #5
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Categorical dependent variable means classification, otherwise regression
    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))

        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])

    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                # Pass the full class list up front: a single mini-batch may
                # not contain every class
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have a single category ({}); gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)
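
`deserialize_sklearn_estimator` and its serializing counterpart are not shown here; a minimal sketch, assuming a base64-wrapped pickle codec (the project's actual serializer may differ, e.g. jsonpickle):

import base64
import pickle

def serialize_sklearn_estimator_sketch(estimator):
    # Encode the fitted estimator as an ASCII string that fits inside JSON
    return base64.b64encode(pickle.dumps(estimator)).decode('ascii')

def deserialize_sklearn_estimator_sketch(payload):
    return pickle.loads(base64.b64decode(payload))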