def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)

    # Read inputs
    data = io_helper.fetch_data()['data']
    X = io_helper.fetch_dataframe(data['independent'])
    y = io_helper.fetch_dataframe(data['dependent']).iloc[:, 0]

    if len(X) >= 2000:
        logging.warning(
            'HINMine runs in quadratic time, processing {} samples could be very slow.'.format(len(X)))

    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    # Normalize each feature column to unit norm
    if normalize:
        X = X.apply(lambda x: x / np.linalg.norm(x))

    # Build the network and propositionalize it with HINMine
    network = construct_adjacency_graph(range(len(X)), X.values, y.values)
    propositionalized = timecall(cf_netSDM.hinmine_propositionalize)(network, damping)['train_features']['data']

    # Save propositionalized features as a tabular data resource
    results_dict = _construct_results(propositionalized)
    io_helper.save_results(json.dumps(results_dict), shapes.Shapes.TABULAR_DATA_RESOURCE)
def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)

    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs['data']
    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    # Assemble the independent variables into a (n_samples, n_variables) array
    data_array = np.zeros((len(data['independent'][0]['series']), len(data['independent'])))
    col_number = 0
    row_number = 0
    for var in data['independent']:
        for value in var['series']:
            data_array[row_number, col_number] = value
            row_number += 1
        col_number += 1
        row_number = 0

    # Normalize each column to unit norm
    if normalize:
        for col_number in range(data_array.shape[1]):
            data_array[:, col_number] = data_array[:, col_number] / np.linalg.norm(data_array[:, col_number])

    # Build the network and propositionalize it with HINMine
    network = construct_adjacency_graph(range(data_array.shape[0]), data_array, data['dependent'][0]['series'])
    propositionalized = cf_netSDM.hinmine_propositionalize(network, damping)['train_features']['data']

    # Format the propositionalized features as a tabular data resource
    results_dict = {
        'profile': 'tabular-data-resource',
        'name': 'hinmine-features',
        'data': [],
        'schema': {
            'fields': [],
            'primaryKey': 'id'
        }
    }
    n = propositionalized.shape[0]
    for row_index in range(n):
        instance = {"id": row_index}
        for col_index in range(n):
            instance["feature_%i" % (col_index + 1)] = propositionalized[row_index, col_index]
        results_dict['data'].append(instance)
    for col_index in range(n):
        results_dict['schema']['fields'].append({
            'name': 'feature_%i' % (col_index + 1),
            'type': 'float'
        })

    io_helper.save_results(json.dumps(results_dict), 'text/plain')
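# Illustrative only: both main() variants above assume io_helper.fetch_data() returns a
# payload shaped roughly like the dict below (variable names and values are invented).
# Each variable descriptor exposes its observed values under 'series' and its type under
# 'type' -> 'name', which is how data['independent'][i]['series'] and var['type']['name']
# are indexed throughout these scripts.
EXAMPLE_INPUTS = {
    'data': {
        'dependent': [
            {'name': 'score', 'type': {'name': 'real'}, 'series': [1.2, 0.7, 3.4]},
        ],
        'independent': [
            {'name': 'age', 'type': {'name': 'integer'}, 'series': [54, 61, 47]},
            {'name': 'weight', 'type': {'name': 'real'}, 'series': [71.5, 80.2, 64.8]},
        ],
    },
}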
def compute(): """Create PFA for kNN.""" # Read intermediate inputs from jobs logging.info("Fetching intermediate data...") inputs = io_helper.fetch_data() indep_vars = inputs["data"]["independent"] # Extract hyperparameters from ENV variables k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS) # featurization featurizer = _create_featurizer(indep_vars) # convert variables into dataframe X = io_helper.fetch_dataframe(variables=indep_vars) X = utils.remove_nulls(X, errors='ignore') X = featurizer.transform(X) estimator = KMeans(n_clusters=k) estimator.fit(X) # Generate PFA for kmeans types = [(var['name'], var['type']['name']) for var in indep_vars] pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa()) # Add centroids as metadata pfa['metadata'] = { 'centroids': json.dumps(estimator.cluster_centers_.tolist()) } # Save or update job_result logging.info('Saving PFA to job_results table') io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA) logging.info("DONE")
def main(): """Calculate histogram of dependent variable in a single-node mode and return output in highcharts JSON.""" try: # Read inputs inputs = io_helper.fetch_data() try: dep_var = inputs["data"]["dependent"][0] except KeyError: logging.warning("Cannot find dependent variables data") dep_var = [] try: indep_vars = inputs["data"]["independent"] except KeyError: logging.warning("Cannot find independent variables data") indep_vars = [] nb_bins = parameters.get_param(BINS_PARAM, int, DEFAULT_BINS) # Compute histograms (JSON formatted for HighCharts) histograms_results = compute_histograms(dep_var, indep_vars, nb_bins) if not INCLUDE_NO_DATA: histograms_results = [ _remove_no_data(hist) for hist in histograms_results ] # Store results io_helper.save_results(json.dumps(histograms_results), shapes.Shapes.HIGHCHARTS) except errors.UserError as e: logging.error(e) strict = parameters.get_boolean_param(STRICT_PARAM, DEFAULT_STRICT) if strict: # Will be handled by catch_user_error raise e else: # Display something to the user and then exit histograms_results = error_histograms(dep_var, indep_vars) io_helper.save_results(histograms_results, shapes.Shapes.HIGHCHARTS) utils.exit_on_error()
def intermediate_kmeans(): """Calculate k-Means locally.""" # Read inputs logging.info("Fetching data...") inputs = io_helper.fetch_data() indep_vars = inputs["data"]["independent"] # Extract hyperparameters from ENV variables k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS) # Load data into a Pandas dataframe logging.info("Loading data...") X = io_helper.fetch_dataframe(variables=indep_vars) # Return variables info, but remove actual data points results = {'indep_vars': []} for var in indep_vars: if var['type']['name'] in ('integer', 'real'): new_var = {k: v for k, v in var.items() if k != 'series'} mean, std = _get_moments(var) new_var['mean'] = mean new_var['std'] = std else: new_var = var results['indep_vars'].append(new_var) # Drop NaN values X = utils.remove_nulls(X, errors='ignore') if len(X) == 0: logging.warning("All data are NULL, returning empty centroids.") results['centroids'] = [] io_helper.save_results(json.dumps(results), shapes.Shapes.JSON) return # Generate results logging.info("Generating results...") # featurization featurizer = _create_featurizer(indep_vars) X = featurizer.transform(X) m, n = X.shape num_iter = 0 not_converged = True # Run k-Means locally # Have each site compute k initial clusters locally local_centroids = local.initialize_own_centroids(X, k) # Local Optimization Loop while not_converged: # Each local site computes its cluster cluster_labels = local.compute_clustering(X, local_centroids) if OPTIMIZATION == 'lloyd': # Computes its local mean if doing lloyd, and updates centroids local_means = local.compute_mean(X, cluster_labels, k) local_centroids, previous_centroids = local.mean_step( local_means, local_centroids) elif OPTIMIZATION == 'gradient': # Computes the local gradient if doing GD, and takes a GD step local_grad = local.compute_gradient(X, cluster_labels, local_centroids, LR) local_centroids, previous_centroids = local.gradient_step( local_grad, local_centroids) # Check local stopping conditions not_converged, local_delta = local.check_stopping( local_centroids, previous_centroids, EPSILON) num_iter += 1 logging.info("Single-Shot {} ; iter : {} delta : {}".format( OPTIMIZATION, num_iter, local_delta)) results['centroids'] = [lc.tolist() for lc in local_centroids] logging.info("Results:\n{}".format(results)) io_helper.save_results(json.dumps(results), shapes.Shapes.JSON) logging.info("DONE")
import logging
import tempfile
from subprocess import call

from mip_helper import io_helper, parameters

import preprocess

DEFAULT_DOCKER_IMAGE = 'python-jsi-hedwig'

if __name__ == '__main__':
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]
    beam = parameters.get_param('beam', int, 10)
    support = parameters.get_param('support', float, '0.00001')

    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    # Convert the input data to a CSV file that hedwig can consume
    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file
    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__',
        empty_bk,
        examples_file,
        '--beam', str(beam),
        '--support',