def get_multiplier_table():
    """Build geoid -> station area weights (by mkiang).

    For each census block group, the multiplier is the fraction of its
    area that falls inside a station's response polygon. Used for
    aggregating acs-features to station-level.
    """
    import pandas as pd
    import utils.pg_tools as pg
    pgw = pg.PGWrangler(dbitems=pg.get_pgdict_from_cfg())
    conn = pgw.get_conn()
    multiplier = pd.read_sql_query(
        'select a.battalion as station_name, b.geoid, '
        'st_area(st_intersection(a.geom, b.geom)) / '
        'st_area(b.geom) as multiplier '
        'from tiger_shp.tl_2013_39_bg as b '
        'join luigi_raw_shp.cad_station_areas as a on '
        'st_intersects(a.geom, b.geom)', conn)
    conn.close()
    multiplier.set_index('geoid', inplace=True, drop=True)
    return multiplier
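
## Usage sketch (not in the original repo): one way the multiplier table
## could aggregate block-group ACS features up to stations. The helper name
## and `acs_features` are hypothetical; assume a DataFrame indexed by geoid
## with one numeric column per ACS feature.
def aggregate_acs_to_stations(acs_features):
    multiplier = get_multiplier_table()
    ## Align each (geoid, station) overlap row with that geoid's features
    merged = multiplier.join(acs_features, how='inner')
    ## Weight each feature by the share of block-group area inside the
    ## station, then sum the weighted pieces within each station
    weighted = merged[acs_features.columns].mul(merged['multiplier'], axis=0)
    return weighted.groupby(merged['station_name']).sum()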
import utils.pg_tools as pg_tools


def make_default_profile(filename, state, state_abbrev):
    """Write the postgres connection settings plus state info to a profile."""
    postgres_dict = pg_tools.get_pgdict_from_cfg()
    with open(filename, 'w') as f:
        for k, v in postgres_dict.items():
            f.write("{}={}\n".format(k, v))
        f.write("STATE={}\n".format(state))
        f.write("STATE_ABBREV={}\n".format(state_abbrev))
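
## Usage sketch (hypothetical filename and values): with the Ohio shapefiles
## used above (state FIPS 39), a plausible invocation would be
##
##   make_default_profile('default_profile', 'Ohio', 'OH')
##
## which writes one KEY=value line per postgres connection setting, followed
## by the STATE and STATE_ABBREV lines.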
import hashlib
import json

import pandas as pd

import utils.pg_tools as pg


def make_json_df(m_table='model.lookup', k_values=[5, 30, 60, 90],
                 return_raw=False, subset='2016-08-17'):
    """Just a wrapper to get the json info out of postgres"""
    ## Make a connection
    pgw = pg.PGWrangler(dbitems=pg.get_pgdict_from_cfg())
    conn = pgw.get_conn()

    ## Download the raw data -- clean up json column
    raw_data = pd.read_sql_query('select * from {}'.format(m_table), conn)
    raw_data.columns = raw_data.columns.str.lower()
    raw_data.json = raw_data.json.apply(json.loads)
    if subset:
        raw_data = raw_data[raw_data.timestamp >= subset].copy()
        raw_data.reset_index(inplace=True, drop=True)

    # Helpful for debugging
    if return_raw:
        return raw_data

    ## Extract and normalize the json from raw data -- reindex
    json_df = pd.concat([
        pd.json_normalize(row[1]) for row in raw_data.json.items()
    ])
    json_df.reset_index(inplace=True, drop=True)

    ## Clean up column names
    json_df.columns = json_df.columns.str.lower()
    json_df.columns = json_df.columns.str.replace(' ', '_')
    json_df.columns = json_df.columns.str.replace('&_', '')

    ## Make 2 columns for feature and importance, listed alphabetically
    json_df['features_list_alpha'] = json_df.model_feats_imps.apply(
        lambda x: [y[0] for y in x])
    json_df['importance_list_alpha'] = json_df.model_feats_imps.apply(
        lambda x: [y[1] for y in x])

    ## Convert this list of lists into a dictionary -- easier handling
    json_df.model_feats_imps = json_df.model_feats_imps.apply(
        lambda x: {y[0]: y[1] for y in x})

    ## Now create a new features/importance column, sorted by abs(importance)
    json_df['features_sorted'] = json_df.model_feats_imps.apply(
        lambda x: sorted(x.items(), key=lambda kv: abs(kv[1]), reverse=True))

    ## Break out the precision, recall, AUC, weighted score at each k
    for i, k in enumerate(k_values):
        json_df['p_' + str(k)] = json_df.model_metrics.apply(lambda x: x[i][1][0])
        json_df['r_' + str(k)] = json_df.model_metrics.apply(lambda x: x[i][1][1])
        json_df['auc_' + str(k)] = json_df.model_metrics.apply(lambda x: x[i][1][2])
        json_df['wsc_' + str(k)] = json_df.model_metrics.apply(lambda x: x[i][1][3])

    ## Hash the model and feature strings to make grouping easier
    json_df['feature_hash'] = json_df.features_list_alpha.apply(
        lambda x: hashlib.md5(' '.join(x).encode()).hexdigest())
    json_df['model_hash'] = json_df.model_spec.apply(
        lambda x: hashlib.md5(' '.join(x).encode()).hexdigest())

    ## Change the weights and model spec to strings to allow for grouping
    json_df['model_str'] = json_df.model_spec.apply(str)
    json_df['weights_str'] = json_df.weights.apply(str)

    ## Abbreviate classifier names and drop noisy default arguments
    json_df.model_str = json_df.model_str.str.replace('RandomForestClassifier', 'RF')
    json_df.model_str = json_df.model_str.str.replace('LogisticRegression', 'LR')
    json_df.model_str = json_df.model_str.str.replace('KNeighborsClassifier', 'KNN')
    json_df.model_str = json_df.model_str.str.replace('ExtraTreesClassifier', 'ET')
    json_df.model_str = json_df.model_str.str.replace('bootstrap=False, ', '')
    json_df.model_str = json_df.model_str.str.replace('class_weight=None, ', '')

    return json_df
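
## Usage sketch: with the columns built above ('p_30', 'r_30', 'auc_30',
## 'model_str'), one way to rank recent model runs by precision at k=30.
## Commented out so the module does not hit the database on import.
# json_df = make_json_df()
# leaders = json_df.sort_values('p_30', ascending=False)
# print(leaders[['model_str', 'p_30', 'r_30', 'auc_30']].head(10))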
name = model_name
plt.title(name)
plt.savefig(dir_name + name)
# plt.show()


#### START REAL STUFF

## THIS IS FOR THE TEST DATA
if os.path.isfile('./all_new_model_scores.csv'):
    model_scores = pd.read_csv('./all_new_model_scores.csv')
    model_scores.set_index('Unnamed: 0', inplace=True, drop=True)
    model_scores.index.rename('incident', inplace=True)
else:
    ## Get the truth
    pgw = pg.PGWrangler(dbitems=pg.get_pgdict_from_cfg())
    conn = pgw.get_conn()
    model_scores = pd.read_sql_query(
        'select incident, trns_to_hosp, code_type '
        'from semantic.master where time_year = 2015', conn)

    ## Convert bool to int
    model_scores.trns_to_hosp = model_scores.trns_to_hosp.astype(int)

    ## Create index and remap buckets
    model_scores.set_index('incident', inplace=True, drop=True)
    model_scores['bucket'] = model_scores.code_type.map(bucket_map).fillna(
        'other')

## THIS IS FOR THE TRAIN DATA
if os.path.isfile('./training_model_scores.csv'):
    train_scores = pd.read_csv('./training_model_scores.csv')