def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)

    # Read inputs
    data = io_helper.fetch_data()['data']
    X = io_helper.fetch_dataframe(data['independent'])
    y = io_helper.fetch_dataframe(data['dependent']).iloc[:, 0]

    if len(X) >= 2000:
        logging.warning(
            'HINMine runs in quadratic time, processing {} samples could be very slow.'
            .format(len(X)))

    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    if normalize:
        X = X.apply(lambda x: x / np.linalg.norm(x))

    network = construct_adjacency_graph(range(len(X)), X.values, y.values)
    propositionalized = timecall(cf_netSDM.hinmine_propositionalize)(
        network, damping)['train_features']['data']

    results_dict = _construct_results(propositionalized)
    io_helper.save_results(json.dumps(results_dict),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
def compute():
    """Create PFA for k-means."""
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # Featurization
    featurizer = _create_featurizer(indep_vars)

    # Convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    X = featurizer.transform(X)

    estimator = KMeans(n_clusters=k)
    estimator.fit(X)

    # Generate PFA for k-means
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add centroids as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(estimator.cluster_centers_.tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)
    logging.info("DONE")
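
# A minimal sketch (not part of this module, assuming the metadata layout
# written by `compute` above): how a consumer could read the stored centroids
# back out of the saved PFA document, e.g. for plotting. `load_centroids` is
# a hypothetical helper name.
import json

import numpy as np


def load_centroids(pfa_doc):
    """Return the fitted k-means centroids from `pfa['metadata']` as an array."""
    return np.asarray(json.loads(pfa_doc['metadata']['centroids']))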
def compute():
    """Create PFA for kNN."""
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    params = parameters.fetch_parameters()

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    logging.info('Creating new estimator')
    estimator = _create_estimator(job_type, params)
    featurizer = _create_featurizer(indep_vars)

    # Convert variables into dataframe and drop NULL values
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X)

    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    estimator.fit(X, y)

    # Create PFA from the estimator
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
    pfa['name'] = "kNN"

    # Save or update job_result
    logging.info('Saving PFA to job_results table...')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
def compute(graph_type=None):
    """Perform both intermediate step and aggregation at once."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    result = _compute_intermediate_result(inputs)
    corr, columns, crosstab = _aggregate_results([result])

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')
    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        X = io_helper.fetch_dataframe([dep_var] + indep_vars)
        fig = _fig_pca(corr, columns, X)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`')

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Validate inputs before doing any work
    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }
    else:
        # Compute linear regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)
        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Split variables into nominal and numeric ones
    nominal_vars = []
    numeric_vars = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            nominal_vars.append(var['name'])
        else:
            numeric_vars.append(var['name'])

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    result = {
        'columns': numeric_vars,
        'nominal_columns': nominal_vars,
    }
    if len(X):
        result.update({
            'means': X[numeric_vars].mean().values,
            'X^T * X': X[numeric_vars].T.dot(X[numeric_vars].values).values,
            'count': len(X),
        })
        if nominal_vars:
            result['crosstab'] = X[nominal_vars].groupby(nominal_vars).size()\
                .reset_index()\
                .rename(columns={0: 'count'})\
                .to_dict(orient='records')
        else:
            result['crosstab'] = []
    else:
        logging.warning('All values are NAN, returning zero values')
        k = len(result['columns'])
        result.update({
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
            'crosstab': [],
        })
    return result
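
# A minimal sketch (not part of this module) of how an aggregation node could
# pool the per-node statistics produced above. The helper name `_pool_partials`
# is hypothetical; the dict keys (`means`, `X^T * X`, `count`) match the
# intermediate results. Counts weight each node's mean, while the raw X^T X
# matrices simply sum across nodes.
import numpy as np


def _pool_partials(partials):
    """Combine per-node `means`, `X^T * X` and `count` into global values."""
    total = sum(p['count'] for p in partials)
    means = sum(p['count'] * np.asarray(p['means']) for p in partials) / total
    xtx = sum(np.asarray(p['X^T * X']) for p in partials)
    return means, xtx, total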
def intermediate_stats():
    """Calculate summary statistics for a single node."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    labels = _get_labels(indep_vars + [dep_var])
    types = _get_types(indep_vars + [dep_var])

    if len(dep_var['series']) == 0:
        logging.warning('Dependent variable has no values, check your SQL query.')

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    df = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    # Generate results
    logging.info("Generating results...")
    group_variables = [
        var['name'] for var in indep_vars if utils.is_nominal(var)
    ]

    # Grouped statistics
    data = []
    if group_variables:
        for group_name, group in df.groupby(group_variables):
            # If there's only one nominal column, pandas returns a scalar
            if not isinstance(group_name, tuple):
                group_name = (group_name,)
            data += _calc_stats(group, group_name, group_variables, labels, types)

    # Overall statistics
    data += _calc_stats(df, ('all',), [], labels, types)

    logging.info("Results:\n{}".format(data))
    table = {
        'schema': OUTPUT_SCHEMA_INTERMEDIATE,
        'data': data,
    }
    io_helper.save_results(pd.io.json.dumps(table),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
    logging.info("DONE")
def test_fetch_dataframe(mock_read_sql_query):
    data = pd.DataFrame({
        'lefthippocampus': [1., 2.],
        'subjectageyears': [20, 30],
    })
    mock_read_sql_query.return_value = data

    with mock_engine():
        df = fetch_dataframe()

    assert df.to_dict(orient='records') == [{
        'subjectageyears': 20.0,
        'lefthippocampus': 1.0
    }, {
        'subjectageyears': 30.0,
        'lefthippocampus': 2.0
    }]
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Use only numeric variables
    variables = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            logging.warning(
                'Correlation heatmap works only with numerical types ({} is {})'
                .format(var['name'], var['type']['name']))
        else:
            variables.append(var)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=variables)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    if len(X):
        result = {
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.dot(X.values).values,
            'count': len(X),
        }
    else:
        logging.warning('All values are NAN, returning zero values')
        k = X.shape[1]
        result = {
            'columns': list(X.columns),
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
        }
    return result
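
# Hedged sketch: why `means`, `X^T * X` and `count` are enough to build the
# correlation heatmap without sharing raw rows. `_corr_from_moments` is a
# hypothetical helper, not the repository's actual aggregation code.
import numpy as np


def _corr_from_moments(means, xtx, count):
    """Recover a correlation matrix from pooled raw second moments."""
    means = np.asarray(means)
    # Population covariance: E[x x^T] - E[x] E[x]^T
    cov = np.asarray(xtx) / count - np.outer(means, means)
    std = np.sqrt(np.diag(cov))
    return cov / np.outer(std, std)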
def intermediate_kmeans():
    """Calculate k-means locally."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=indep_vars)

    # Return variables info, but remove actual data points
    results = {'indep_vars': []}
    for var in indep_vars:
        if var['type']['name'] in ('integer', 'real'):
            # Drop raw data points ('series') from the variable description
            new_var = {key: v for key, v in var.items() if key != 'series'}
            mean, std = _get_moments(var)
            new_var['mean'] = mean
            new_var['std'] = std
        else:
            new_var = var
        results['indep_vars'].append(new_var)

    # Drop NaN values
    X = utils.remove_nulls(X, errors='ignore')
    if len(X) == 0:
        logging.warning("All data are NULL, returning empty centroids.")
        results['centroids'] = []
        io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
        return

    # Generate results
    logging.info("Generating results...")

    # Featurization
    featurizer = _create_featurizer(indep_vars)
    X = featurizer.transform(X)

    m, n = X.shape
    num_iter = 0
    not_converged = True

    # Run k-means locally: each site computes k initial clusters on its own data
    local_centroids = local.initialize_own_centroids(X, k)

    # Local optimization loop
    while not_converged:
        # Each local site computes its cluster assignment
        cluster_labels = local.compute_clustering(X, local_centroids)
        if OPTIMIZATION == 'lloyd':
            # Lloyd's algorithm: compute local means and update centroids
            local_means = local.compute_mean(X, cluster_labels, k)
            local_centroids, previous_centroids = local.mean_step(
                local_means, local_centroids)
        elif OPTIMIZATION == 'gradient':
            # Gradient descent: compute the local gradient and take a GD step
            local_grad = local.compute_gradient(X, cluster_labels,
                                                local_centroids, LR)
            local_centroids, previous_centroids = local.gradient_step(
                local_grad, local_centroids)

        # Check local stopping conditions
        not_converged, local_delta = local.check_stopping(
            local_centroids, previous_centroids, EPSILON)

        num_iter += 1
        logging.info("Single-Shot {} ; iter : {} delta : {}".format(
            OPTIMIZATION, num_iter, local_delta))

    results['centroids'] = [lc.tolist() for lc in local_centroids]

    logging.info("Results:\n{}".format(results))
    io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
    logging.info("DONE")
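
# Minimal sketch of what the `local.*` helpers used above are assumed to do for
# the Lloyd variant. These are hypothetical reimplementations for illustration;
# the real versions live elsewhere in the repository and may differ.
import numpy as np


def compute_clustering(X, centroids):
    """Assign every row of X to its nearest centroid (Euclidean distance)."""
    dists = np.linalg.norm(
        X[:, None, :] - np.asarray(centroids)[None, :, :], axis=2)
    return dists.argmin(axis=1)


def compute_mean(X, labels, k):
    """Per-cluster mean; empty clusters keep a zero vector here for simplicity."""
    return [X[labels == j].mean(axis=0) if (labels == j).any()
            else np.zeros(X.shape[1])
            for j in range(k)]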
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))
        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(
            json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # Featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # Convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    if len(X) == 0:
        # Log a warning, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train a single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have a single category ({}); gradient boosting '
                    'cannot fit that.'.format(y.iloc[0]))
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Fail if the estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(
            json.dumps(_estimator_metadata(estimator, X, y, featurizer)),
            shapes.Shapes.JSON)
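
# Hedged sketch of the serialize/deserialize round-trip that lets `main` resume
# a partially fitted model across jobs. Pickle + base64 is one plausible
# encoding assumed here; the repository's actual `deserialize_sklearn_estimator`
# may use a different scheme.
import base64
import pickle


def serialize_sklearn_estimator(estimator):
    """Encode a fitted estimator as a JSON-safe string."""
    return base64.b64encode(pickle.dumps(estimator)).decode('ascii')


def deserialize_sklearn_estimator(blob):
    """Restore an estimator serialized by `serialize_sklearn_estimator`."""
    return pickle.loads(base64.b64decode(blob))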
def main():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    if utils.is_nominal(dep_var):
        job_type = 'classification'
    else:
        job_type = 'regression'

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns)

    if X.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {}
        pfa = None
    else:
        # Add intercept
        X.insert(loc=0, column='intercept', value=1.)

        # Remove linearly dependent columns
        X = X.iloc[:, _independent_columns(X)]

        if job_type == 'regression':
            # Fit regression
            result, metadata = _fit_regression(X, y)

            # Generate PFA for predictions
            pfa = _generate_pfa_regressor(result, indep_vars, featurizer)
        elif job_type == 'classification':
            # Run one-vs-rest logistic regression, one binary fit per class
            result = {}
            metadata = {}
            for cat in y.cat.categories:
                r, m = _fit_logit(X, y == cat)
                result[cat] = r
                metadata[cat] = m

            if all(result[cat]['intercept']['coef'] is None
                   for cat in y.cat.categories):
                raise errors.UserError('Not enough data to apply logistic regression.')

            # Generate PFA for predictions
            pfa = _generate_pfa_classifier(result, indep_vars, featurizer,
                                           y.cat.categories)

        # Add metadata from the model
        pfa['metadata'] = metadata

    # TODO: save multiple outputs - PFA and coefficients
    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
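
# Sketch of how a one-vs-rest classifier built from the per-class logit fits
# above could predict: score every class and pick the argmax. The coefficient
# layout (`result[cat]` mapping column name -> {'coef': ...}, with an
# 'intercept' entry) is an assumption inferred from the checks in `main`;
# `predict_ovr` itself is hypothetical.
import numpy as np


def predict_ovr(row, result, categories):
    """Predict the category of one sample from per-class logit coefficients.

    `row` maps column name -> value; the intercept column is implicitly 1.
    """
    features = dict(row, intercept=1.)
    scores = [sum(c['coef'] * features[name] for name, c in result[cat].items())
              for cat in categories]
    return categories[int(np.argmax(scores))]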