def compute_series(dep_var, categories, grouping_var=None):
    """Build one histogram series per group (or a single 'all' series)."""
    series = []
    has_nulls = pd.isnull(dep_var['series']).any()

    if utils.is_nominal(dep_var):
        if not grouping_var:
            series.append({"name": "all", "data": count(dep_var['series'], categories)})
        else:
            for series_name in grouping_var['type']['enumeration']:
                filtered_data = [
                    v for v, d in zip(dep_var['series'], grouping_var['series'])
                    if d == series_name
                ]
                series.append({"name": series_name, "data": count(filtered_data, categories)})
    else:
        if not grouping_var:
            values = pd.Series(dep_var['series'])
            if has_nulls:
                # The last category is the 'None' bucket: bin non-null values
                # against the numeric edges, then append the null count.
                data = [int(i) for i in histogram(values.dropna(), categories[:-1])[0]] \
                    + [int(values.isnull().sum())]
            else:
                data = [int(i) for i in histogram(values, categories)[0]]
            series.append({"name": "all", "data": data})
        else:
            values = pd.Series(dep_var['series'])
            grouping_values = pd.Series(grouping_var['series']).fillna('No data')
            for series_name in grouping_var['type']['enumeration']:
                filtered_data = pd.Series(
                    [v for v, d in zip(values, grouping_values) if d == series_name])
                if has_nulls:
                    data = [int(i) for i in histogram(filtered_data.dropna(), categories[:-1])[0]] \
                        + [int(filtered_data.isnull().sum())]
                else:
                    data = [int(i) for i in histogram(filtered_data, categories)[0]]
                series.append({"name": series_name, "data": data})
    return series
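# Usage sketch for the numeric branch of compute_series (illustrative, not
# part of the module). It assumes the `histogram` helper behaves like
# numpy.histogram over explicit bin edges, which is what the code above
# relies on.
import numpy as np
import pandas as pd

values = pd.Series([1.2, 3.4, 2.2, None])
edges = [0.0, 2.0, 4.0]  # `categories` for a numeric variable, minus the 'None' bucket
data = [int(c) for c in np.histogram(values.dropna(), edges)[0]]
data.append(int(values.isnull().sum()))  # trailing count for the 'None' bucket
print([{'name': 'all', 'data': data}])   # [{'name': 'all', 'data': [1, 2, 1]}]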
def error_histograms(dep_var, indep_vars):
    """Return JSON-encoded error histograms, overall and per nominal covariable."""
    histograms = []
    # Check the variable's values, not the descriptor dict itself
    if len(dep_var['series']) > 0:
        histograms.append(error_histogram(dep_var))
        grouping_vars = [indep_var for indep_var in indep_vars if utils.is_nominal(indep_var)]
        for grouping_var in grouping_vars:
            histograms.append(error_histogram(dep_var, grouping_var))
    return json.dumps(histograms)
def compute_histograms(dep_var, indep_vars, nb_bins=DEFAULT_BINS):
    """Return histograms of the dependent variable, overall and per nominal covariable."""
    histograms = []
    # Check the variable's values, not the descriptor dict itself
    if len(dep_var['series']) > 0:
        histograms.append(compute_histogram(dep_var, nb_bins=nb_bins))
        grouping_vars = [indep_var for indep_var in indep_vars if utils.is_nominal(indep_var)]
        for grouping_var in grouping_vars:
            histograms.append(compute_histogram(dep_var, grouping_var, nb_bins))
    return histograms
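# Both wrappers above select nominal covariables as grouping variables.
# Standalone illustration of that filter; the inline type-name check stands
# in for utils.is_nominal, which is assumed to inspect the declared type.
indep_vars = [
    {'name': 'agegroup', 'type': {'name': 'polynominal', 'enumeration': ['-50y', '50-59y']}},
    {'name': 'score', 'type': {'name': 'real'}},
]
grouping_vars = [v for v in indep_vars if v['type']['name'] in ('binominal', 'polynominal')]
print([v['name'] for v in grouping_vars])  # ['agegroup']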
def _create_featurizer(indep_vars):
    transforms = []
    for var in indep_vars:
        if utils.is_nominal(var):
            transforms.append(OneHotEncoding(var['name'], var['type']['enumeration']))
        else:
            transforms.append(DummyTransform(var['name']))
    return Featurizer(transforms)
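# Sketch of the expansion the featurizer performs. OneHotEncoding and
# DummyTransform are project-specific classes not shown here;
# pandas.get_dummies produces an equivalent layout for this hypothetical
# input: one indicator column per enumerated level, numeric columns
# passed through unchanged.
import pandas as pd

data = pd.DataFrame({'score': [1.5, 2.0], 'agegroup': ['-50y', '50-59y']})
X = pd.get_dummies(data, columns=['agegroup'])
print(X.columns.tolist())  # ['score', 'agegroup_-50y', 'agegroup_50-59y']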
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Fail fast on invalid inputs, before fetching and featurizing the data
    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }
    else:
        # Compute linear regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)
        result = {
            'summary': output,
            'columns': list(X.columns),
            # Convert numpy arrays to lists so the result is JSON-serializable
            'means': X.mean().tolist(),
            'X^T * X': X.T.values.dot(X.values).tolist(),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
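# Sketch of the master-side use of these intermediates (hypothetical; the
# aggregation step is not part of this module). Summing per-node X^T X
# matrices and row counts yields the global quantities without ever
# pooling raw rows across nodes.
import numpy as np

node_results = [
    {'X^T * X': np.array([[4., 2.], [2., 3.]]), 'count': 10},
    {'X^T * X': np.array([[3., 1.], [1., 2.]]), 'count': 8},
]
global_xtx = sum(r['X^T * X'] for r in node_results)
global_count = sum(r['count'] for r in node_results)
print(global_xtx.tolist(), global_count)  # [[7.0, 3.0], [3.0, 5.0]] 18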
def compute_categories(dep_var, nb_bins=DEFAULT_BINS):
    """Return bin edges (or nominal levels) and their display labels."""
    values = pd.Series(dep_var['series'])
    if len(values) == 0:
        raise errors.UserError('Dependent variable {} is empty.'.format(dep_var['name']))

    # TODO: dep_var['series'] can contain both np.nan (in numerical variables)
    # and None (in nominal), pd.isnull can handle them both
    has_nulls = pd.isnull(dep_var['series']).any()

    if utils.is_nominal(dep_var):
        categories = [str(c) for c in dep_var['type']['enumeration']]
        if 'enumeration_labels' in dep_var['type']:
            categories_labels = [str(c) for c in dep_var['type']['enumeration_labels']]
        else:
            categories_labels = categories
        if has_nulls:
            categories.append('None')
            categories_labels.append('No data')
    else:
        # calculate min and max if not available in variable (ignore null values)
        minimum = dep_var.get('minValue', values.min())
        maximum = dep_var.get('maxValue', values.max())
        if values.isnull().all():
            categories = ['None']
            categories_labels = ['No data']
        else:
            if utils.is_integer(dep_var):
                # guard against a zero-width range producing a zero step
                step = math.ceil((maximum - minimum) / nb_bins) or 1
                categories = list(arange(minimum, maximum, step).tolist())
                categories_labels = ["%d - %d" % (v, v + step) for v in categories]
            else:
                step = (maximum - minimum) / nb_bins
                categories = list(arange(minimum, maximum, step).tolist())
                categories_labels = [
                    "{:.2f} - {:.2f}".format(v, v + step) for v in categories
                ]
            categories.append(categories[-1] + step)  # upper edge of the last bin
            if has_nulls:
                categories.append('None')
                categories_labels.append('No data')
    return categories, categories_labels
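# Worked example of the integer branch above: values spanning 20..59 with
# nb_bins=4 give step = ceil(39 / 4) = 10 and edges [20, 30, 40, 50, 60].
import math
from numpy import arange

minimum, maximum, nb_bins = 20, 59, 4
step = math.ceil((maximum - minimum) / nb_bins)
categories = list(arange(minimum, maximum, step).tolist())
labels = ["%d - %d" % (v, v + step) for v in categories]
categories.append(categories[-1] + step)
print(categories)  # [20, 30, 40, 50, 60]
print(labels)      # ['20 - 30', '30 - 40', '40 - 50', '50 - 60']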
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    nominal_vars = []
    numeric_vars = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            nominal_vars.append(var['name'])
        else:
            numeric_vars.append(var['name'])

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    result = {
        'columns': numeric_vars,
        'nominal_columns': nominal_vars,
    }
    if len(X):
        result.update({
            'means': X[numeric_vars].mean().values,
            'X^T * X': X[numeric_vars].T.dot(X[numeric_vars].values).values,
            'count': len(X),
        })
        if nominal_vars:
            result['crosstab'] = X[nominal_vars].groupby(nominal_vars).size() \
                .reset_index() \
                .rename(columns={0: 'count'}) \
                .to_dict(orient='records')
        else:
            result['crosstab'] = []
    else:
        logging.warning('All values are NAN, returning zero values')
        k = len(result['columns'])
        result.update({
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
            'crosstab': [],
        })
    return result
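# Standalone illustration of the crosstab records built above: contingency
# counts over the nominal columns, flattened into a list of dicts.
import pandas as pd

X = pd.DataFrame({'gender': ['M', 'M', 'F'],
                  'agegroup': ['-50y', '-50y', '50-59y']})
crosstab = X.groupby(['gender', 'agegroup']).size() \
    .reset_index() \
    .rename(columns={0: 'count'}) \
    .to_dict(orient='records')
print(crosstab)
# [{'gender': 'F', 'agegroup': '50-59y', 'count': 1},
#  {'gender': 'M', 'agegroup': '-50y', 'count': 2}]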
def intermediate_stats():
    """Calculate summary statistics for single node."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    labels = _get_labels(indep_vars + [dep_var])
    types = _get_types(indep_vars + [dep_var])

    if len(dep_var['series']) == 0:
        logging.warning('Dependent variable has no values, check your SQL query.')

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    df = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    # Generate results
    logging.info("Generating results...")
    group_variables = [var['name'] for var in indep_vars if utils.is_nominal(var)]

    # grouped statistics
    data = []
    if group_variables:
        for group_name, group in df.groupby(group_variables):
            # if there's only one nominal column
            if not isinstance(group_name, tuple):
                group_name = (group_name,)
            data += _calc_stats(group, group_name, group_variables, labels, types)

    # overall statistics
    data += _calc_stats(df, ('all',), [], labels, types)

    logging.info("Results:\n{}".format(data))
    table = {
        'schema': OUTPUT_SCHEMA_INTERMEDIATE,
        'data': data,
    }
    io_helper.save_results(pd.io.json.dumps(table), shapes.Shapes.TABULAR_DATA_RESOURCE)
    logging.info("DONE")
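# Why the tuple check above is needed: pandas yields a scalar key when
# grouping by a single column, so single- and multi-column grouping can be
# handled uniformly by wrapping scalars into 1-tuples.
import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1, 2, 3]})
for group_name, group in df.groupby('g'):
    if not isinstance(group_name, tuple):
        group_name = (group_name,)
    print(group_name, len(group))  # ('a',) 2 then ('b',) 1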
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Use only numeric variables
    variables = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            logging.warning(
                'Correlation heatmap works only with numerical types ({} is {})'.format(
                    var['name'], var['type']['name']))
        else:
            variables.append(var)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=variables)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    if len(X):
        result = {
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.dot(X.values).values,
            'count': len(X),
        }
    else:
        logging.warning('All values are NAN, returning zero values')
        k = X.shape[1]
        result = {
            'columns': list(X.columns),
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
        }
    return result
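# Sketch of how a master node could combine these intermediates into a
# correlation matrix (hypothetical; the aggregation step is not shown in
# this module). Given globally summed X^T X, global column means, and total
# row count n, the population covariance is X^T X / n - means means^T.
import numpy as np

xtx = np.array([[14., 9.], [9., 6.]])  # summed X^T X for columns a=[1,2,3], b=[1,1,2]
means = np.array([2., 4. / 3.])        # global column means
n = 3                                  # summed row count
cov = xtx / n - np.outer(means, means)
std = np.sqrt(np.diag(cov))
corr = cov / np.outer(std, std)
print(corr.round(3))  # [[1.    0.866] [0.866 1.   ]]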
def main():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    if utils.is_nominal(dep_var):
        job_type = 'classification'
    else:
        job_type = 'regression'

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns)

    if X.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {}
        pfa = None
    else:
        # Add intercept
        X.insert(loc=0, column='intercept', value=1.)
        # Remove linearly dependent columns
        X = X.iloc[:, _independent_columns(X)]

        if job_type == 'regression':
            # Fit regression
            result, metadata = _fit_regression(X, y)
            # Generate PFA for predictions
            pfa = _generate_pfa_regressor(result, indep_vars, featurizer)
        elif job_type == 'classification':
            # Run one-vs-others for each class
            result = {}
            metadata = {}
            for cat in y.cat.categories:
                r, m = _fit_logit(X, y == cat)
                result[cat] = r
                metadata[cat] = m
            if all(result[cat]['intercept']['coef'] is None for cat in y.cat.categories):
                raise errors.UserError('Not enough data to apply logistic regression.')
            # Generate PFA for predictions
            pfa = _generate_pfa_classifier(result, indep_vars, featurizer, y.cat.categories)

        # Add metadata from model
        pfa['metadata'] = metadata

    # TODO: save multiple outputs - PFA and coefficients
    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
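# Sketch of the one-vs-others loop above on hypothetical data; _fit_logit
# is assumed to wrap a logistic fit similar to statsmodels' Logit (this
# module already uses statsmodels' OLS for the regression case).
import pandas as pd
import statsmodels.api as sm

X = pd.DataFrame({'intercept': 1., 'x': [0.1, 0.4, 0.35, 0.8]})
y = pd.Series(['a', 'a', 'b', 'b'], dtype='category')
for cat in y.cat.categories:
    # Binary target: current class vs. all others
    fit = sm.Logit((y == cat).astype(float), X).fit(disp=0)
    print(cat, fit.params.values)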