Example #1
def compute_series(dep_var, categories, grouping_var=None):
    """Count occurrences per category (nominal) or per histogram bin (continuous),
    optionally split into one series per level of `grouping_var`."""
    series = []
    has_nulls = pd.isnull(dep_var['series']).any()
    if utils.is_nominal(dep_var):
        if not grouping_var:
            series.append({"name": "all", "data": count(dep_var['series'], categories)})
        else:
            for series_name in grouping_var['type']['enumeration']:
                filtered_data = [v for v, d in zip(dep_var['series'], grouping_var['series']) if d == series_name]
                series.append({"name": series_name, "data": count(filtered_data, categories)})
    else:
        if not grouping_var:
            values = pd.Series(dep_var['series'])

            if has_nulls:
                # The last category is the 'No data' bucket: histogram the non-null
                # values over the remaining bin edges, then append the null count.
                data = [int(i) for i in histogram(values.dropna(), categories[:-1])[0]] + [int(values.isnull().sum())]
            else:
                data = [int(i) for i in histogram(values, categories)[0]]

            series.append({"name": "all", "data": data})
        else:
            values = pd.Series(dep_var['series'])
            grouping_values = pd.Series(grouping_var['series']).fillna('No data')

            for series_name in grouping_var['type']['enumeration']:
                filtered_data = pd.Series([v for v, d in zip(values, grouping_values) if d == series_name])
                if has_nulls:
                    data = [int(i) for i in histogram(filtered_data.dropna(), categories[:-1])[0]] + [int(filtered_data.isnull().sum())]
                else:
                    data = [int(i) for i in histogram(filtered_data, categories)[0]]

                series.append({"name": series_name, "data": data})
    return series
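For context, a minimal sketch of how compute_series might be called. The variable dicts mirror the shape the function reads ('series', 'type', 'enumeration'); the utils and count stubs below are illustrative assumptions, not this module's actual helpers.

import pandas as pd

class utils:  # stand-in for the module's utils helper (assumption)
    @staticmethod
    def is_nominal(var):
        return var['type']['name'] in ('binominal', 'polynominal')

def count(values, categories):
    # Stand-in for the count helper: occurrences of each category, in order
    return [sum(1 for v in values if str(v) == c) for c in categories]

dep_var = {'name': 'agegroup',
           'series': ['-50', '50-59', '50-59', '-50'],
           'type': {'name': 'polynominal', 'enumeration': ['-50', '50-59']}}

compute_series(dep_var, categories=['-50', '50-59'])
# -> [{'name': 'all', 'data': [2, 2]}]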
Example #2
def error_histograms(dep_var, indep_vars):
    histograms = []
    if len(dep_var) > 0:
        # Overall histogram, plus one grouped histogram per nominal covariable
        histograms.append(error_histogram(dep_var))
        grouping_vars = [indep_var for indep_var in indep_vars if utils.is_nominal(indep_var)]
        for grouping_var in grouping_vars:
            histograms.append(error_histogram(dep_var, grouping_var))
    return json.dumps(histograms)
Example #3
def compute_histograms(dep_var, indep_vars, nb_bins=DEFAULT_BINS):
    histograms = []
    if len(dep_var) > 0:
        # Overall histogram, plus one grouped histogram per nominal covariable
        histograms.append(compute_histogram(dep_var, nb_bins=nb_bins))
        grouping_vars = [indep_var for indep_var in indep_vars if utils.is_nominal(indep_var)]
        for grouping_var in grouping_vars:
            histograms.append(compute_histogram(dep_var, grouping_var, nb_bins))
    return histograms
Example #4
def _create_featurizer(indep_vars):
    # One-hot encode nominal covariables; pass numeric ones through unchanged
    transforms = []
    for var in indep_vars:
        if utils.is_nominal(var):
            transforms.append(
                OneHotEncoding(var['name'], var['type']['enumeration']))
        else:
            transforms.append(DummyTransform(var['name']))
    return Featurizer(transforms)
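Featurizer, OneHotEncoding, and DummyTransform are not shown in these examples. Minimal stand-ins for the interface the code above relies on (a transform(df) method returning a 2-D array plus a columns attribute) might look like this; they are illustrative assumptions, not the actual implementations:

import numpy as np

class DummyTransform:
    """Illustrative pass-through for a single numeric column (assumption)."""
    def __init__(self, name):
        self.name = name
        self.columns = [name]

    def transform(self, df):
        return df[self.name].values.reshape(-1, 1)

class OneHotEncoding:
    """Illustrative one-hot encoder over a fixed enumeration (assumption)."""
    def __init__(self, name, enumeration):
        self.name = name
        self.enumeration = enumeration
        self.columns = ['{}_{}'.format(name, c) for c in enumeration]

    def transform(self, df):
        return np.column_stack([
            (df[self.name] == c).astype(float) for c in self.enumeration
        ])

class Featurizer:
    """Concatenates per-variable transforms into one design matrix (assumption)."""
    def __init__(self, transforms):
        self.transforms = transforms
        self.columns = [c for t in transforms for c in t.columns]

    def transform(self, df):
        return np.hstack([t.transform(df) for t in self.transforms])

With stand-ins like these, _create_featurizer(indep_vars).transform(data) produces the design matrix consumed by the regression examples below.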
Example #5
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Validate inputs before doing any work
    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    if data.empty:
        logging.warning('All values are NaN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }

    else:
        # Compute linear regression with an explicit intercept column
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)

        result = {
            'summary': output,
            'columns': list(X.columns),
            # Convert numpy arrays to lists so the result is JSON-serializable
            'means': X.mean().tolist(),
            'X^T * X': X.T.values.dot(X.values).tolist(),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
Example #6
def compute_categories(dep_var, nb_bins=DEFAULT_BINS):
    values = pd.Series(dep_var['series'])

    if len(values) == 0:
        raise errors.UserError('Dependent variable {} is empty.'.format(
            dep_var['name']))

    # NOTE: dep_var['series'] can contain both np.nan (in numerical variables)
    # and None (in nominal ones); pd.isnull handles both.
    has_nulls = pd.isnull(dep_var['series']).any()
    if utils.is_nominal(dep_var):
        categories = [str(c) for c in dep_var['type']['enumeration']]
        if 'enumeration_labels' in dep_var['type']:
            categories_labels = [
                str(c) for c in dep_var['type']['enumeration_labels']
            ]
        else:
            categories_labels = categories

        if has_nulls:
            categories.append('None')
            categories_labels.append('No data')
    else:
        # Calculate min and max if not available in the variable
        # (Series.min/max ignore null values)
        minimum = dep_var.get('minValue', values.min())
        maximum = dep_var.get('maxValue', values.max())

        all_nulls = values.isnull().all()

        if all_nulls:
            categories = ['None']
            categories_labels = ['No data']
        else:
            if utils.is_integer(dep_var):
                step = math.ceil((maximum - minimum) / nb_bins)
                categories = arange(minimum, maximum, step).tolist()
                categories_labels = [
                    "%d - %d" % (v, v + step) for v in categories
                ]
            else:
                step = (maximum - minimum) / nb_bins
                categories = arange(minimum, maximum, step).tolist()
                categories_labels = [
                    "{:.2f} - {:.2f}".format(v, v + step) for v in categories
                ]
            # Append the closing bin edge so len(edges) == len(labels) + 1
            categories.append(categories[-1] + step)

            if has_nulls:
                categories.append('None')
                categories_labels.append('No data')

    return categories, categories_labels
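To make the relationship between bin edges and labels concrete, a worked run of the continuous branch (the variable dict and bin count are assumed for illustration):

# dep_var = {'name': 'score', 'series': [0.0, 2.5, 5.0, 7.5, 10.0],
#            'type': {'name': 'real'}}
# categories, labels = compute_categories(dep_var, nb_bins=5)
#
# step = (10 - 0) / 5 = 2.0, so:
#   categories == [0.0, 2.0, 4.0, 6.0, 8.0, 10.0]   # 6 bin edges
#   labels     == ['0.00 - 2.00', '2.00 - 4.00', '4.00 - 6.00',
#                  '6.00 - 8.00', '8.00 - 10.00']   # 5 bin labels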
Example #7
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    nominal_vars = []
    numeric_vars = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            nominal_vars.append(var['name'])
        else:
            numeric_vars.append(var['name'])

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    result = {
        'columns': numeric_vars,
        'nominal_columns': nominal_vars,
    }
    if len(X):
        result.update({
            'means': X[numeric_vars].mean().values,
            'X^T * X': X[numeric_vars].T.dot(X[numeric_vars].values).values,
            'count': len(X),
        })
        if nominal_vars:
            # Count occurrences of each combination of nominal values
            result['crosstab'] = X[nominal_vars].groupby(nominal_vars).size()\
                                                .reset_index()\
                                                .rename(columns={0: 'count'})\
                                                .to_dict(orient='records')
        else:
            result['crosstab'] = []
    else:
        logging.warning('All values are NaN, returning zero values')
        k = len(result['columns'])
        result.update({
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
            'crosstab': [],
        })
    return result
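The crosstab entries are plain per-combination count records; for example (data assumed for illustration):

# X[['gender', 'agegroup']] with rows ('M', '-50'), ('M', '-50'), ('F', '50-59')
# produces:
#   [{'gender': 'F', 'agegroup': '50-59', 'count': 1},
#    {'gender': 'M', 'agegroup': '-50', 'count': 2}]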
Example #8
def intermediate_stats():
    """Calculate summary statistics for single node."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    labels = _get_labels(indep_vars + [dep_var])
    types = _get_types(indep_vars + [dep_var])

    if len(dep_var['series']) == 0:
        logging.warning(
            'Dependent variable has no values, check your SQL query.')

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    df = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    # Generate results
    logging.info("Generating results...")

    group_variables = [
        var['name'] for var in indep_vars if utils.is_nominal(var)
    ]

    # Grouped statistics
    data = []
    if group_variables:
        for group_name, group in df.groupby(group_variables):
            # groupby returns a scalar key when grouping by a single
            # nominal column; normalize it to a tuple
            if not isinstance(group_name, tuple):
                group_name = (group_name, )

            data += _calc_stats(group, group_name, group_variables, labels,
                                types)

    # Overall statistics
    data += _calc_stats(df, ('all', ), [], labels, types)

    logging.info("Results:\n{}".format(data))
    table = {
        'schema': OUTPUT_SCHEMA_INTERMEDIATE,
        'data': data,
    }
    io_helper.save_results(pd.io.json.dumps(table),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
    logging.info("DONE")
Example #9
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Use only numeric variables
    variables = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            logging.warning(
                'Correlation heatmap works only with numerical types ({} is {})'.format(
                    var['name'], var['type']['name']))
        else:
            variables.append(var)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=variables)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    if len(X):
        result = {
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.dot(X.values).values,
            'count': len(X),
        }
    else:
        logging.warning('All values are NaN, returning zero values')
        k = X.shape[1]
        result = {
            'columns': list(X.columns),
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
        }
    return result
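These aggregates are sufficient statistics: a downstream step can recover the covariance, and hence the correlation matrix, from count, means, and X^T * X without ever seeing row-level data, which is presumably why exactly these values are returned. A sketch of that reconstruction (a standard derivation, not code from this module):

import numpy as np

def correlation_from_sufficient_stats(xtx, means, count):
    """Recover the correlation matrix from n, the mean vector and X^T X (sketch)."""
    mu = np.asarray(means)
    # Sample covariance: (X^T X - n * mu mu^T) / (n - 1)
    cov = (np.asarray(xtx) / count - np.outer(mu, mu)) * count / (count - 1)
    std = np.sqrt(np.diag(cov))
    return cov / np.outer(std, std)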
Example #10
def main():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    if utils.is_nominal(dep_var):
        job_type = 'classification'
    else:
        job_type = 'regression'

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns)

    if X.empty:
        logging.warning('All values are NaN, returning zero values')
        result = {}
        pfa = None

    else:
        # Add intercept
        X.insert(loc=0, column='intercept', value=1.)

        # Remove linearly dependent columns
        X = X.iloc[:, _independent_columns(X)]

        # Fit regression
        if job_type == 'regression':
            result, metadata = _fit_regression(X, y)

            # Generate PFA for predictions
            pfa = _generate_pfa_regressor(result, indep_vars, featurizer)

        elif job_type == 'classification':
            # Run one-vs-others for each class
            result = {}
            metadata = {}
            for cat in y.cat.categories:
                r, m = _fit_logit(X, y == cat)
                result[cat] = r
                metadata[cat] = m

            if all(result[cat]['intercept']['coef'] is None
                   for cat in y.cat.categories):
                raise errors.UserError(
                    'Not enough data to apply logistic regression.')

            # Generate PFA for predictions
            pfa = _generate_pfa_classifier(result, indep_vars, featurizer,
                                           y.cat.categories)

        # Add metadata from model
        pfa['metadata'] = metadata

        # TODO: save multiple outputs - PFA and coefficients

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
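_independent_columns is referenced above but not defined in these examples. One plausible implementation selects a maximal linearly independent subset of columns via QR with column pivoting; this is an illustrative sketch, not necessarily the module's actual method:

import numpy as np
from scipy.linalg import qr

def _independent_columns(X, tol=1e-8):
    """Indices of a maximal set of linearly independent columns (sketch)."""
    # QR with column pivoting orders columns by decreasing |R[i, i]|;
    # diagonal entries below the tolerance indicate dependent columns.
    _, r, pivot = qr(X.values, mode='economic', pivoting=True)
    rank = int(np.sum(np.abs(np.diag(r)) > tol))
    return sorted(pivot[:rank])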