def _create_histogram(self, M_c, data, columns, mc_col_indices, filename):
    """Render one horizontal histogram/bar chart per predicted column and save it.

    Parameters:
        M_c: metadata dict; M_c['column_metadata'][i]['modeltype'] selects
            continuous (histogram) vs. categorical (bar chart) rendering.
        data: 2-D numpy array of codes, one column per predicted column.
        columns: display names for each plotted column (used as subplot titles).
        mc_col_indices: for each data column, its original column index in M_c.
        filename: basename of the image written under the web resources dir.
    """
    # Renamed from `dir`, which shadowed the builtin.
    output_dir = S.path.web_resources_data_dir
    full_filename = os.path.join(output_dir, filename)
    num_cols = data.shape[1]
    # col_i goes from 0 to number of predicted columns;
    # mc_col_idx is the original column's index in M_c.
    for col_i in range(num_cols):
        mc_col_idx = mc_col_indices[col_i]
        data_i = data[:, col_i]
        # BUGFIX: matplotlib subplot indices are 1-based; passing 0 is invalid.
        ax = pylab.subplot(1, num_cols, col_i + 1, title=columns[col_i])
        if M_c['column_metadata'][mc_col_idx]['modeltype'] == 'normal_inverse_gamma':
            pylab.hist(data_i, orientation='horizontal')
        else:
            # Categorical column: count occurrences of each observed label.
            str_data = [du.convert_code_to_value(M_c, mc_col_idx, code)
                        for code in data_i]
            unique_labels = list(set(str_data))
            np_str_data = numpy.array(str_data)
            counts = [sum(np_str_data == label) for label in unique_labels]
            # BUGFIX: bar positions must match len(counts). The full
            # code_to_value domain may contain labels that were never sampled,
            # so using its size previously gave barh() mismatched
            # position/count lengths.
            num_bars = len(unique_labels)
            rects = pylab.barh(range(num_bars), counts)
            heights = numpy.array([rect.get_height() for rect in rects])
            ax.set_yticks(numpy.arange(num_bars) + heights / 2)
            ax.set_yticklabels(unique_labels)
    pylab.tight_layout()
    pylab.savefig(full_filename)
def convert_row(row, M_c):
    """Convert a row of codes (as stored in T) to human-understandable values.

    Missing entries -- float NaN or the literal string 'nan' -- are passed
    through unchanged; every other cell is mapped with du.convert_code_to_value.

    Returns a tuple with one converted entry per input cell.
    """
    ret = []
    for cidx, code in enumerate(row):
        # BUGFIX: test the string sentinel first -- numpy.isnan raises
        # TypeError when handed the string 'nan', so the original order
        # crashed before the string check could run.
        if code == 'nan' or numpy.isnan(code):
            ret.append(code)  # keep the missing-value marker as-is
        else:
            ret.append(du.convert_code_to_value(M_c, cidx, code))
    return tuple(ret)
def convert_row(row, M_c):
    """Map a row's stored codes to their human-understandable values.

    NaN floats and the string 'nan' denote missing data and are returned
    untouched; all other cells go through du.convert_code_to_value.

    Returns the converted row as a tuple.
    """
    converted = []
    for cidx, code in enumerate(row):
        # BUGFIX: check for the 'nan' string before calling numpy.isnan,
        # which raises TypeError on string input.
        is_missing = (code == 'nan') or numpy.isnan(code)
        converted.append(code if is_missing
                         else du.convert_code_to_value(M_c, cidx, code))
    return tuple(converted)
def simulate(self, tablename, columnstring, newtablename, whereclause,
             numpredictions, order_by):
    """Simple predictive samples.

    Returns one row per prediction, with all the given and predicted
    variables: columns fixed by the WHERE clause are echoed back with their
    given values, the rest are sampled from the backend.
    """
    X_L_list, X_D_list, M_c = self.persistence_layer.get_latent_states(
        tablename)
    M_c, M_r, T = self.persistence_layer.get_metadata_and_table(tablename)
    numrows = len(M_r['idx_to_name'])
    name_to_idx = M_c['name_to_idx']

    # Parse whereclause into givens Y = [(row, col, code), ...].
    where_col_idxs_to_vals = dict()
    if whereclause == "" or '=' not in whereclause:
        Y = None
    else:
        varlist = [[c.strip() for c in b.split('=')]
                   for b in whereclause.split('AND')]
        Y = []
        for colname, colval in varlist:
            if isinstance(colval, (str, unicode)):
                colval = ast.literal_eval(colval)
            where_col_idxs_to_vals[name_to_idx[colname]] = colval
            Y.append((numrows + 1, name_to_idx[colname], colval))
        # Map raw values to their internal codes.
        Y = [(r, c, du.convert_value_to_code(M_c, c, colval))
             for r, c, colval in Y]

    # Parse queried columns; columns already fixed by WHERE are not queried.
    colnames = [colname.strip() for colname in columnstring.split(',')]
    col_indices = [name_to_idx[colname] for colname in colnames]
    # Membership test directly on the dict (no .keys() needed).
    query_col_indices = [idx for idx in col_indices
                         if idx not in where_col_idxs_to_vals]
    Q = [(numrows + 1, col_idx) for col_idx in query_col_indices]

    # NOTE(review): the original assembled an args_dict here that was never
    # passed to the backend; removed as dead code.
    out = self.backend.simple_predictive_sample(M_c, X_L_list, X_D_list, Y,
                                                Q, numpredictions)

    # TODO: Add histogram call back in, but on Python client locally!
    # self._create_histogram(M_c, numpy.array(out), columns, col_indices,
    #                        tablename + '_histogram')

    # Convert sampled codes back to original values; echo the given values
    # for WHERE-fixed columns.
    data = []
    for vals in out:
        row = []
        sample_i = 0
        for idx in col_indices:
            if idx in where_col_idxs_to_vals:
                row.append(where_col_idxs_to_vals[idx])
            else:
                row.append(du.convert_code_to_value(M_c, idx, vals[sample_i]))
                # Advance the sample cursor only for queried columns: the
                # backend returns one entry per Q cell, not per output column.
                sample_i += 1
        data.append(row)
    return {'message': 'Simulated data:', 'columns': colnames, 'data': data}
def infer(self, tablename, columnstring, newtablename, confidence,
          whereclause, limit, numsamples, order_by=False):
    """Impute missing values.

    Sample INFER: INFER columnstring FROM tablename WHERE whereclause
        WITH confidence LIMIT limit;
    Sample INFER INTO: INFER columnstring FROM tablename WHERE whereclause
        WITH confidence INTO newtablename LIMIT limit;
    Argument newtablename == null/emptystring if we don't want to do INTO.

    Only imputations whose confidence reaches the given threshold are kept;
    final row selection/ordering is delegated to self.select().
    """
    # TODO: actually impute only missing values, instead of all values.
    X_L_list, X_D_list, M_c = self.persistence_layer.get_latent_states(
        tablename)
    M_c, M_r, T = self.persistence_layer.get_metadata_and_table(tablename)
    numrows = len(T)
    t_array = numpy.array(T, dtype=float)
    name_to_idx = M_c['name_to_idx']

    # Resolve the requested columns ('*' means every column).
    if '*' in columnstring:
        col_indices = name_to_idx.values()
    else:
        colnames = [colname.strip() for colname in columnstring.split(',')]
        col_indices = [name_to_idx[colname] for colname in colnames]

    # Collect every (row, col) cell that is currently missing (NaN).
    Q = []
    for row_idx in range(numrows):
        for col_idx in col_indices:
            if numpy.isnan(t_array[row_idx, col_idx]):
                Q.append([row_idx, col_idx])

    # FIXME: the purpose of the whereclause is to specify 'given'
    # p(missing_value | X_L, X_D, whereclause)
    # TODO: should all observed values besides the ones being imputed
    # be givens?
    if whereclause == "" or '=' not in whereclause:
        Y = None
    else:
        varlist = [[c.strip() for c in b.split('=')]
                   for b in whereclause.split('AND')]
        # NOTE(review): unlike simulate(), string values here are NOT run
        # through ast.literal_eval before encoding -- confirm this is
        # intentional.
        Y = [(numrows + 1, name_to_idx[colname], colval)
             for colname, colval in varlist]
        Y = [(r, c, du.convert_value_to_code(M_c, c, colval))
             for r, c, colval in Y]

    # NOTE(review): the original assembled an args_dict (including a
    # per-iteration args_dict['Q'] = q) that was never passed to the
    # backend; removed as dead code.
    # Impute each missing cell, keeping only confident results up to limit.
    ret = []
    for q in Q:
        value, conf = self.backend.impute_and_confidence(
            M_c, X_L_list, X_D_list, Y, [q], numsamples)
        if conf >= confidence:
            ret.append((q[0], q[1], value))
            # len(ret) replaces the original manual counter.
            if len(ret) >= limit:
                break

    # Decode imputed codes back to values and index them by (row, col).
    imputations_list = [(r, c, du.convert_code_to_value(M_c, c, code))
                        for r, c, code in ret]
    imputations_dict = defaultdict(dict)
    for r, c, val in imputations_list:
        imputations_dict[r][c] = val

    return self.select(tablename, columnstring, whereclause, limit,
                       order_by=order_by, imputations_dict=imputations_dict)