def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)

    # Read inputs
    data = io_helper.fetch_data()['data']
    X = io_helper.fetch_dataframe(data['independent'])
    y = io_helper.fetch_dataframe(data['dependent']).iloc[:, 0]

    if len(X) >= 2000:
        logging.warning(
            'HINMine runs in quadratic time, processing {} samples could be very slow.'.format(len(X)))

    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    if normalize:
        X = X.apply(lambda x: x / np.linalg.norm(x))

    network = construct_adjacency_graph(range(len(X)), X.values, y.values)
    propositionalized = timecall(cf_netSDM.hinmine_propositionalize)(network, damping)['train_features']['data']

    results_dict = _construct_results(propositionalized)
    io_helper.save_results(json.dumps(results_dict), shapes.Shapes.TABULAR_DATA_RESOURCE)

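# A minimal sketch of the `_construct_results` helper used above. Its body is an
# assumption: it mirrors the inline tabular-data-resource construction found in the
# older HinMine `main` further down in this section.
def _construct_results(propositionalized):
    """Pack the propositionalized feature matrix into a tabular-data-resource dict."""
    n = propositionalized.shape[0]
    results_dict = {
        'profile': 'tabular-data-resource',
        'name': 'hinmine-features',
        'data': [],
        'schema': {'fields': [], 'primaryKey': 'id'},
    }
    for row_index in range(n):
        instance = {'id': row_index}
        for col_index in range(n):
            instance['feature_%i' % (col_index + 1)] = propositionalized[row_index, col_index]
        results_dict['data'].append(instance)
    for col_index in range(n):
        results_dict['schema']['fields'].append({
            'name': 'feature_%i' % (col_index + 1),
            'type': 'float',
        })
    return results_dict
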
def compute(graph_type=None):
    """Perform both intermediate step and aggregation at once."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    result = _compute_intermediate_result(inputs)
    corr, columns, crosstab = _aggregate_results([result])

    graph_type = graph_type or parameters.get_parameter('graph', str, 'correlation_heatmap')
    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        X = io_helper.fetch_dataframe([dep_var] + indep_vars)
        fig = _fig_pca(corr, columns, X)
    else:
        raise errors.UserError('MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`')

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")

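# A hypothetical sketch of `_fig_corr_heatmap`, assuming it wraps the correlation
# matrix in a standard Plotly heatmap figure dict. The real helper may also
# overlay the `crosstab` counts; only the figure structure below is standard Plotly.
def _fig_corr_heatmap(corr, columns, crosstab=None):
    """Render the correlation matrix `corr` (square, aligned with `columns`) as a Plotly heatmap."""
    return {
        'data': [{
            'type': 'heatmap',
            'z': [list(row) for row in corr],
            'x': list(columns),
            'y': list(columns),
            'zmin': -1,
            'zmax': 1,
        }],
        'layout': {'title': 'Correlation heatmap'},
    }
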
def compute():
    """Create PFA for kNN."""
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    params = parameters.fetch_parameters()

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    logging.info('Creating new estimator')
    estimator = _create_estimator(job_type, params)
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe and drop NaN values
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X)

    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    estimator.fit(X, y)

    # Create PFA from the estimator
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
    pfa['name'] = "kNN"

    # Save or update job_result
    logging.info('Saving PFA to job_results table...')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)

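# A hypothetical sketch of `_create_estimator` for the kNN job above, assuming
# `params` carries an `n_neighbors` entry; the real helper may expose more
# hyperparameters. The scikit-learn estimators and their `n_neighbors` argument
# are standard API.
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

def _create_estimator(job_type, params):
    """Pick a scikit-learn kNN estimator matching the job type."""
    k = int(params.get('n_neighbors', 5))
    if job_type == 'classification':
        return KNeighborsClassifier(n_neighbors=k)
    return KNeighborsRegressor(n_neighbors=k)
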
def compute():
    """Create PFA for k-means."""
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # featurization
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    X = featurizer.transform(X)

    estimator = KMeans(n_clusters=k)
    estimator.fit(X)

    # Generate PFA for kmeans
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add centroids as metadata
    pfa['metadata'] = {'centroids': json.dumps(estimator.cluster_centers_.tolist())}

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)
    logging.info("DONE")

def intermediate_stats():
    """Calculate X^T * X, means and count for a single node; these are later used to
    construct the covariance matrix."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()

    result = _compute_intermediate_result(inputs)
    io_helper.save_results(json.dumps(result), shapes.Shapes.JSON)
    logging.info("DONE")

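# A sketch of how the per-node partials pool into one covariance matrix, assuming
# each partial has the 'X^T * X', 'means' and 'count' keys that `intermediate()`
# later in this section produces. The helper name `_combine_covariance` is
# hypothetical; the formula Cov = (sum X^T X - N * mu mu^T) / (N - 1) is standard.
import numpy as np

def _combine_covariance(partials):
    """Pool per-node X^T * X, means and counts into a single covariance matrix."""
    N = sum(p['count'] for p in partials)
    # weighted grand mean across nodes
    mu = sum(np.asarray(p['means']) * p['count'] for p in partials) / N
    # sum of raw second moments
    xtx = sum(np.asarray(p['X^T * X']) for p in partials)
    return (xtx - N * np.outer(mu, mu)) / (N - 1)
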
def compute():
    """Perform both intermediate step and aggregation at once."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()

    result = _compute_intermediate_result(inputs)
    corr, columns = _aggregate_results([result])
    _save_corr_heatmap(corr, columns)

def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns, index=data.index)

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }
    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)
        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')

def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)

    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs['data']
    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    data_array = np.zeros((len(data['independent'][0]['series']), len(data['independent'])))
    col_number = 0
    row_number = 0
    for var in data['independent']:
        for value in var['series']:
            data_array[row_number, col_number] = value
            row_number += 1
        col_number += 1
        row_number = 0

    if normalize:
        for col_number in range(data_array.shape[1]):
            data_array[:, col_number] = data_array[:, col_number] / np.linalg.norm(data_array[:, col_number])

    network = construct_adjacency_graph(range(data_array.shape[0]), data_array, data['dependent'][0]['series'])
    propositionalized = cf_netSDM.hinmine_propositionalize(network, damping)['train_features']['data']

    results_dict = {
        'profile': 'tabular-data-resource',
        'name': 'hinmine-features',
        'data': [],
        'schema': {
            'fields': [],
            'primaryKey': 'id'
        }
    }
    n = propositionalized.shape[0]
    for row_index in range(n):
        instance = {"id": row_index}
        for col_index in range(n):
            instance["feature_%i" % (col_index + 1)] = propositionalized[row_index, col_index]
        results_dict['data'].append(instance)
    for col_index in range(n):
        results_dict['schema']['fields'].append({
            'name': 'feature_%i' % (col_index + 1),
            'type': 'float'
        })
    io_helper.save_results(json.dumps(results_dict), 'text/plain')

def intermediate_stats():
    """Calculate summary statistics for single node."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    labels = _get_labels(indep_vars + [dep_var])
    types = _get_types(indep_vars + [dep_var])

    if len(dep_var['series']) == 0:
        logging.warning('Dependent variable has no values, check your SQL query.')

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    df = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    # Generate results
    logging.info("Generating results...")
    group_variables = [var['name'] for var in indep_vars if utils.is_nominal(var)]

    # grouped statistics
    data = []
    if group_variables:
        for group_name, group in df.groupby(group_variables):
            # if there's only one nominal column
            if not isinstance(group_name, tuple):
                group_name = (group_name, )
            data += _calc_stats(group, group_name, group_variables, labels, types)

    # overall statistics
    data += _calc_stats(df, ('all', ), [], labels, types)

    logging.info("Results:\n{}".format(data))
    table = {
        'schema': OUTPUT_SCHEMA_INTERMEDIATE,
        'data': data,
    }
    io_helper.save_results(pd.io.json.dumps(table), shapes.Shapes.TABULAR_DATA_RESOURCE)
    logging.info("DONE")

def main():
    """Calculate histogram of dependent variable in a single-node mode and return output in highcharts JSON."""
    try:
        # Read inputs
        inputs = io_helper.fetch_data()
        try:
            dep_var = inputs["data"]["dependent"][0]
        except KeyError:
            logging.warning("Cannot find dependent variables data")
            dep_var = []
        try:
            indep_vars = inputs["data"]["independent"]
        except KeyError:
            logging.warning("Cannot find independent variables data")
            indep_vars = []
        nb_bins = parameters.get_param(BINS_PARAM, int, DEFAULT_BINS)

        # Compute histograms (JSON formatted for HighCharts)
        histograms_results = compute_histograms(dep_var, indep_vars, nb_bins)
        if not INCLUDE_NO_DATA:
            histograms_results = [_remove_no_data(hist) for hist in histograms_results]

        # Store results
        io_helper.save_results(json.dumps(histograms_results), shapes.Shapes.HIGHCHARTS)
    except errors.UserError as e:
        logging.error(e)
        strict = parameters.get_boolean_param(STRICT_PARAM, DEFAULT_STRICT)
        if strict:
            # Will be handled by catch_user_error
            raise e
        else:
            # Display something to the user and then exit
            histograms_results = error_histograms(dep_var, indep_vars)
            io_helper.save_results(histograms_results, shapes.Shapes.HIGHCHARTS)
            utils.exit_on_error()

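# A hypothetical sketch of the per-variable binning behind `compute_histograms`,
# for a single continuous variable. The helper name `_histogram_for_var` is an
# assumption; the real code also handles nominal variables, grouping by
# independent variables, and the no-data buckets removed above.
import numpy as np

def _histogram_for_var(var, nb_bins):
    """Bin one continuous variable into a Highcharts column-chart dict."""
    counts, edges = np.histogram([v for v in var['series'] if v is not None], bins=nb_bins)
    categories = ['%.1f - %.1f' % (edges[i], edges[i + 1]) for i in range(len(counts))]
    return {
        'chart': {'type': 'column'},
        'title': {'text': var['name']},
        'xAxis': {'categories': categories},
        'series': [{'name': var['name'], 'data': counts.tolist()}],
    }
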
def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    design = get_parameter(inputs["parameters"], DESIGN_PARAM)

    # Check dependent variable type (should be continuous)
    if dep_var["type"]["name"] not in ["integer", "real"]:
        logging.warning("Dependent variable should be continuous!")
        return None

    # Extract data and parameters from inputs
    data = format_data(inputs["data"])

    # Compute anova and generate PFA output
    anova_results = format_output(compute_anova(dep_var, indep_vars, data, design).to_dict())

    # Store results
    io_helper.save_results(anova_results, Shapes.JSON)

def aggregate(job_ids):
    """Get partial regression coefficients together with covariance matrix from all nodes
    and combine them into a single estimate.

    :param job_ids: list of job_ids with intermediate results
    """
    # Read inputs
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = _load_intermediate_data(job_ids)

    # Pool results
    result = _combine_estimates(results)

    # Generate PFA from coefficients
    featurizer = _create_featurizer(indep_vars)
    pfa = _generate_pfa_regressor(result, indep_vars, featurizer)

    # Save job_result
    logging.info('Saving PFA to job_results table...')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)

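# A hypothetical sketch of one plausible `_combine_estimates`: inverse-variance
# weighting of per-node coefficients. The 'coef'/'var' keys are assumptions, not
# confirmed by the source; the real helper may instead pool via the full
# covariance matrices shipped by the intermediate step.
import numpy as np

def _combine_estimates(results):
    """Pool per-node coefficient estimates by inverse-variance weighting."""
    coefs = np.array([r['coef'] for r in results], dtype=float)
    variances = np.array([r['var'] for r in results], dtype=float)
    weights = 1.0 / variances
    pooled = (weights * coefs).sum(axis=0) / weights.sum(axis=0)
    return {'coef': pooled.tolist(), 'var': (1.0 / weights.sum(axis=0)).tolist()}
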
def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Check dependent variable type (should be continuous)
    if dep_var["type"]["name"] not in ["integer", "real"]:
        logging.warning("Dependent variable should be continuous!")
        return None

    # Extract data and parameters from inputs
    data = format_data(inputs["data"])

    # Compute linear-regression and generate PFA output
    linear_regression_results = format_output(compute_linear_regression(dep_var, indep_vars, data))

    # Store results
    io_helper.save_results(linear_regression_results, 'application/json')

def test_fetch_data(mock_read_sql_query):
    data = pd.DataFrame({
        'lefthippocampus': [1., 2.],
        'subjectageyears': [20, 30],
    })
    mock_read_sql_query.return_value = data
    with mock_engine():
        inputs = fetch_data()
    assert inputs == {
        'data': {
            'dependent': [{
                'label': 'lefthippocampus',
                'maxValue': 5.0,
                'mean': 3.0,
                'minValue': 0.1,
                'name': 'lefthippocampus',
                'series': [1.0, 2.0],
                'std': 0.35,
                'type': {'name': 'real'}
            }],
            'independent': [{
                'label': 'Age Years',
                'maxValue': 130.0,
                'minValue': 0.0,
                'name': 'subjectageyears',
                'series': [20, 30],
                'type': {'name': 'integer'}
            }]
        },
        'parameters': []
    }

def main(clean_files=False):
    """
    :param clean_files: if True, clean files afterwards
    """
    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]
    beam = parameters.get_parameter('beam', int, 10)
    support = parameters.get_parameter('support', float, '0.00001')

    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file
    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__', empty_bk, examples_file,
        '--beam', str(beam),
        '--support', str(support),
        '-f', 'csv',
        '-l',
        '-o', rules_out_file,
        '--nocache'
    ])

    with open(rules_out_file) as f:
        results = f.read()

    if clean_files:
        os.remove(out_file)
        os.remove(rules_out_file)

    io_helper.save_results(results.replace('less_than', '<'), shapes.Shapes.TEXT)

import tempfile
import logging
from subprocess import call

from mip_helper import io_helper, parameters

import preprocess

DEFAULT_DOCKER_IMAGE = 'python-jsi-hedwig'

if __name__ == '__main__':
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]
    beam = parameters.get_param('beam', int, 10)
    support = parameters.get_param('support', float, '0.00001')

    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file
    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__', empty_bk, examples_file,
        '--beam', str(beam),
        '--support', str(support),
        '-f', 'csv',
        '-l',
        '-o', rules_out_file,
        '--nocache'
    ])

def intermediate_kmeans():
    """Calculate k-Means locally."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=indep_vars)

    # Return variables info, but remove actual data points
    results = {'indep_vars': []}
    for var in indep_vars:
        if var['type']['name'] in ('integer', 'real'):
            new_var = {key: v for key, v in var.items() if key != 'series'}
            mean, std = _get_moments(var)
            new_var['mean'] = mean
            new_var['std'] = std
        else:
            new_var = var
        results['indep_vars'].append(new_var)

    # Drop NaN values
    X = utils.remove_nulls(X, errors='ignore')
    if len(X) == 0:
        logging.warning("All data are NULL, returning empty centroids.")
        results['centroids'] = []
        io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
        return

    # Generate results
    logging.info("Generating results...")

    # featurization
    featurizer = _create_featurizer(indep_vars)
    X = featurizer.transform(X)

    m, n = X.shape
    num_iter = 0
    not_converged = True

    # Run k-Means locally
    # Have each site compute k initial clusters locally
    local_centroids = local.initialize_own_centroids(X, k)

    # Local Optimization Loop
    while not_converged:
        # Each local site computes its cluster
        cluster_labels = local.compute_clustering(X, local_centroids)
        if OPTIMIZATION == 'lloyd':
            # Computes its local mean if doing lloyd, and updates centroids
            local_means = local.compute_mean(X, cluster_labels, k)
            local_centroids, previous_centroids = local.mean_step(local_means, local_centroids)
        elif OPTIMIZATION == 'gradient':
            # Computes the local gradient if doing GD, and takes a GD step
            local_grad = local.compute_gradient(X, cluster_labels, local_centroids, LR)
            local_centroids, previous_centroids = local.gradient_step(local_grad, local_centroids)

        # Check local stopping conditions
        not_converged, local_delta = local.check_stopping(local_centroids, previous_centroids, EPSILON)

        num_iter += 1
        logging.info("Single-Shot {} ; iter : {} delta : {}".format(OPTIMIZATION, num_iter, local_delta))

    results['centroids'] = [lc.tolist() for lc in local_centroids]
    logging.info("Results:\n{}".format(results))
    io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
    logging.info("DONE")

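# A minimal sketch of the `_get_moments` helper used above, assuming it returns
# the mean and standard deviation of one variable's series while skipping
# missing values.
import numpy as np

def _get_moments(var):
    """Mean and standard deviation of one variable's series, ignoring nulls."""
    series = np.array([v for v in var['series'] if v is not None], dtype=float)
    return float(series.mean()), float(series.std())
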
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))
        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have single category ({}), Gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)

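# A hypothetical sketch of the estimator (de)serialization pair used above,
# assuming a pickle + base64 round-trip so the payload fits in a JSON document.
# The real helpers may use a different serialization scheme.
import base64
import pickle

def serialize_sklearn_estimator(estimator):
    """Turn a fitted scikit-learn estimator into a JSON-safe string."""
    return base64.b64encode(pickle.dumps(estimator)).decode('ascii')

def deserialize_sklearn_estimator(payload):
    """Inverse of `serialize_sklearn_estimator`."""
    return pickle.loads(base64.b64decode(payload))
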
def main():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    if utils.is_nominal(dep_var):
        job_type = 'classification'
    else:
        job_type = 'regression'

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns)

    if X.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {}
        pfa = None
    else:
        # Add intercept
        X.insert(loc=0, column='intercept', value=1.)

        # Remove linearly dependent columns
        X = X.iloc[:, _independent_columns(X)]

        # Fit regression
        if job_type == 'regression':
            result, metadata = _fit_regression(X, y)

            # Generate PFA for predictions
            pfa = _generate_pfa_regressor(result, indep_vars, featurizer)
        elif job_type == 'classification':
            # Run one-vs-others for each class
            result = {}
            metadata = {}
            for cat in y.cat.categories:
                r, m = _fit_logit(X, y == cat)
                result[cat] = r
                metadata[cat] = m

            if all(result[cat]['intercept']['coef'] is None for cat in y.cat.categories):
                raise errors.UserError('Not enough data to apply logistic regression.')

            # Generate PFA for predictions
            pfa = _generate_pfa_classifier(result, indep_vars, featurizer, y.cat.categories)

        # Add metadata from model
        pfa['metadata'] = metadata

    # TODO: save multiple outputs - PFA and coefficients
    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')

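# A hypothetical sketch of `_independent_columns`: greedily keep a column only if
# it raises the matrix rank, yielding a maximal linearly independent subset. The
# real helper may use QR with pivoting instead; this is one straightforward way
# to get the behavior the comment above describes.
import numpy as np

def _independent_columns(X):
    """Indices of a maximal linearly independent subset of the columns of DataFrame X."""
    kept = []
    for j in range(X.shape[1]):
        candidate = X.iloc[:, kept + [j]].values
        if np.linalg.matrix_rank(candidate) == len(kept) + 1:
            kept.append(j)
    return kept
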
def main():
    logging.basicConfig(level=logging.INFO)
    inputs = io_helper.fetch_data()

    # Dependent variable: for t-SNE this might be the labels (optional)
    labels = None
    dependent = inputs["data"].get("dependent", [])
    indep_vars = inputs["data"]["independent"]

    # For t-SNE the data dimensions
    if not data_types_in_allowed(indep_vars, ["integer", "real"]):
        logging.warning("Independent variables should be continuous!")
        return None

    data = format_independent_data(inputs["data"])
    df = pd.DataFrame.from_dict(data)
    source_dimensions = df.shape[1]  # number of columns
    num_points = df.shape[0]  # number of samples/points
    convdf = df.apply(lambda x: pd.to_numeric(x))

    # Write the data to a temporary file
    f = tempfile.NamedTemporaryFile(delete=False)
    input_data = convdf.values.astype(np.float32)
    logging.debug('input {}'.format(input_data))

    # Get the parameters (optional)
    perplexity = 30
    theta = 0.5
    target_dimensions = 2
    iterations = 1000
    do_zscore = True
    dependent_is_label = True
    try:
        perplexity = get_parameter(inputs['parameters'], 'perplexity', perplexity)
        theta = get_parameter(inputs['parameters'], 'theta', theta)
        target_dimensions = get_parameter(inputs['parameters'], 'target_dimensions', target_dimensions)
        iterations = get_parameter(inputs['parameters'], 'iterations', iterations)
        do_zscore_str = get_parameter(inputs['parameters'], 'do_zscore', str(do_zscore))
        if do_zscore_str == 'True':
            do_zscore = True
        elif do_zscore_str == 'False':
            do_zscore = False
        else:
            raise ValueError
        dependent_is_label_str = get_parameter(inputs['parameters'], 'dependent_is_label', str(dependent_is_label))
        if dependent_is_label_str == 'True':
            dependent_is_label = True
        elif dependent_is_label_str == 'False':
            dependent_is_label = False
        else:
            raise ValueError
    except ValueError as e:
        logging.error("Could not convert supplied parameter to value, error: %s", e)
        raise
    except Exception:
        logging.error("Unexpected error: %s", sys.exc_info()[0])
        raise

    # Compute results
    if do_zscore:
        input_data = scipy.stats.zscore(input_data)
    if len(dependent) > 0 and dependent_is_label:
        dep_var = dependent[0]
        labels = dep_var["series"]

    input_file_path = f.name
    input_data.tofile(input_file_path)
    f.close()

    f = tempfile.NamedTemporaryFile(delete=False)
    output_file_path = f.name
    f.close()

    output = a_tsne(input_file_path, output_file_path, num_points, source_dimensions,
                    target_dimensions, perplexity, theta, iterations)
    logging.debug('output shape {}'.format(output.shape))
    logging.debug('output {}'.format(output))

    chart = generate_scatterchart(output, indep_vars, labels, perplexity, theta, iterations)
    logging.debug("Highchart: %s", chart)

    io_helper.save_results(chart, shapes.Shapes.HIGHCHARTS)
    logging.info("Highchart output saved to database.")

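# A hypothetical sketch of `generate_scatterchart`, assuming it wraps the 2-D
# t-SNE coordinates in a Highcharts scatter config, grouping points by label when
# labels are given. The real helper may add styling/tooltips and may serialize
# the config to JSON before it is saved.
def generate_scatterchart(output, indep_vars, labels, perplexity, theta, iterations):
    """Build a Highcharts scatter-chart dict from 2-D embedding coordinates."""
    points = output.tolist()
    if labels is None:
        series = [{'name': 'samples', 'data': points}]
    else:
        groups = {}
        for point, label in zip(points, labels):
            groups.setdefault(str(label), []).append(point)
        series = [{'name': name, 'data': data} for name, data in groups.items()]
    return {
        'chart': {'type': 'scatter'},
        'title': {'text': 't-SNE (perplexity={}, theta={}, iterations={})'.format(
            perplexity, theta, iterations)},
        'series': series,
    }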