def _load_intermediate_data(job_ids):
    jobs_data = [io_helper.get_results(job_id).data for job_id in job_ids]

    # chain all results together, ignore empty results
    data = list(itertools.chain(*[json.loads(d) for d in jobs_data if d]))

    if not data:
        raise errors.UserError('Intermediate jobs {} do not have any data.'.format(job_ids))

    return data
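# A minimal sketch of the payloads the loader above expects: each job stores a
# JSON-encoded list, empty payloads are dropped by the `if d` filter, and
# itertools.chain flattens the rest. The literals here are hypothetical.
import itertools
import json

jobs_data = ['[1, 2]', '', '[3]']  # hypothetical intermediate results
data = list(itertools.chain(*[json.loads(d) for d in jobs_data if d]))
assert data == [1, 2, 3]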
def aggregate_stats(job_ids):
    """Get all partial statistics from all nodes and aggregate them.
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = [json.loads(io_helper.get_results(str(job_id)).data)
               for job_id in job_ids]

    corr, columns = _aggregate_results(results)

    _save_corr_heatmap(corr, columns)
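# `_aggregate_results` is not shown above; this is a minimal sketch assuming
# each node ships sufficient statistics (row count `n`, per-column sums `sx`
# and the cross-product sums matrix `sxx`), which is one standard way to pool
# a Pearson correlation matrix without moving raw rows between nodes. The
# key names and `columns` field are assumptions, not the repo's actual schema.
import numpy as np

def _aggregate_results_sketch(results):
    n = sum(r['n'] for r in results)
    sx = np.sum([r['sx'] for r in results], axis=0)
    sxx = np.sum([r['sxx'] for r in results], axis=0)

    mean = sx / n
    cov = sxx / n - np.outer(mean, mean)   # pooled covariance
    std = np.sqrt(np.diag(cov))
    corr = cov / np.outer(std, std)        # pooled Pearson correlation
    return corr, results[0]['columns']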
def _load_intermediate_data(job_ids):
    jobs_data = [io_helper.get_results(job_id).data for job_id in job_ids]

    # chain all results together, ignore empty results
    data = list(
        itertools.chain(*[json.loads(d)['data'] for d in jobs_data if d]))

    if not data:
        raise errors.UserError(
            'Intermediate jobs {} do not have any data.'.format(job_ids))

    df = pd.DataFrame(data)
    # JSON deserializes `group` as a list, which is unhashable; convert to
    # tuples so the groups can be used as keys (e.g. in groupby)
    df['group'] = df['group'].map(tuple)
    return df
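# Illustrative only (hypothetical data): with list-valued `group`, groupby
# raises "unhashable type: 'list'"; after `.map(tuple)` it works as expected.
import pandas as pd

df = pd.DataFrame([{'group': ['a', '1'], 'count': 10},
                   {'group': ['a', '1'], 'count': 5}])
df['group'] = df['group'].map(tuple)
print(df.groupby('group')['count'].sum())   # ('a', '1') -> 15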
def _load_intermediate_data(job_ids):
    data = []
    for job_id in job_ids:
        job_result = io_helper.get_results(job_id)

        # log errors (e.g. about missing data), but do not reraise them
        if job_result.error:
            logging.warning(job_result.error)
        else:
            pfa = json.loads(job_result.data)
            data.append(pfa)

    if not data:
        raise errors.UserError('All jobs {} returned an error.'.format(job_ids))

    return data
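# A self-contained sketch of the tolerant behaviour above: results carrying an
# error are only logged, and UserError fires only when every job failed.
# JobResult and the payloads are hypothetical stand-ins for io_helper's types.
import logging
from collections import namedtuple

JobResult = namedtuple('JobResult', ['error', 'data'])

results = [JobResult(error='missing data', data=None),
           JobResult(error=None, data='{"name": "model_a"}')]

loaded = []
for job_result in results:
    if job_result.error:
        logging.warning(job_result.error)   # logged, not raised
    else:
        loaded.append(job_result.data)
assert len(loaded) == 1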
def aggregate_kmeans(job_ids):
    """Compute merging of clusters according to least merging error
    (e.g. smallest distance between centroids).
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    data = [
        json.loads(io_helper.get_results(str(job_id)).data)
        for job_id in job_ids
    ]

    local_centroids = [
        np.array(x['centroids']) for x in data if x['centroids']
    ]
    indep_vars = data[0]['indep_vars']

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    logging.info("Centroids:\n{}".format(remote_centroids))

    # Create fake KMeans estimator and assign it our centroids
    estimator = KMeans()
    estimator.cluster_centers_ = np.array(remote_centroids)

    # Generate PFA for kmeans and add centroids to metadata
    featurizer = _create_featurizer(indep_vars)
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add serialized model as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(np.array(remote_centroids).tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
    logging.info("DONE")
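# `remote.aggregate_clusters` is not shown here; this sketches one scheme
# consistent with the docstring: stack all local centroids, then repeatedly
# merge the closest pair (by centroid distance) until `k` global centroids
# remain. The function name and `k` default are assumptions for illustration.
import numpy as np

def aggregate_clusters_sketch(local_centroids, k=None):
    centroids = np.vstack(local_centroids)
    k = k or len(local_centroids[0])
    while len(centroids) > k:
        # pairwise distances; mask the diagonal so a point can't match itself
        d = np.linalg.norm(centroids[:, None] - centroids[None, :], axis=-1)
        np.fill_diagonal(d, np.inf)
        i, j = np.unravel_index(np.argmin(d), d.shape)
        merged = centroids[[i, j]].mean(axis=0)   # smallest-distance merge
        centroids = np.vstack([np.delete(centroids, [i, j], axis=0), merged])
    return centroids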
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))

        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])

    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have single category ({}), Gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)
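# `deserialize_sklearn_estimator` is referenced above but not shown; a minimal
# sketch, assuming the common pickle + base64 round-trip so the estimator can
# live inside a JSON `estimator` field. Both helper names here follow that
# assumption and are not necessarily the repo's actual implementation.
import base64
import pickle

def serialize_sklearn_estimator(estimator):
    # binary pickle -> ASCII string that can be embedded in a JSON document
    return base64.b64encode(pickle.dumps(estimator)).decode('ascii')

def deserialize_sklearn_estimator(payload):
    return pickle.loads(base64.b64decode(payload))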