def economicactivity_info(engine, id_company): """ Function to print economic activity from taxes and inpsections tables engine: query id_company: rutempresamask """ qry = """set role direccion_trabajo_inspections_write; select gcae, ccae, count(*) from raw.inspections_complete where rutempresamask = '{}' group by gcae, ccae order by ccae;""".format(id_company) tab_ins = plsql.query(qry, engine) print('Economic activity from Inpsections data: \n') print(tab_ins) print('\n') qry = """set role direccion_trabajo_inspections_write; select actividadeconomica, count(*) from raw.taxes where rutmask = '{}' group by actividadeconomica order by count desc;""".format(id_company) tab_tax = plsql.query(qry, engine) print('Economic activity from Taxes data:\n') print(tab_tax)
def freq_matters_bookupdated(engine, role, id_company): """ Function to create frequency table of matters infracted engine: to query role: role for query id_company: rutempresamask """ qry = """set role {}; select case when trim(matter_code) = '-' then '' else trim(matter_code) end as matter, count(*) from cleaned.infracted_matters_updatedbook where rutempresamask = {} group by matter order by matter;""".format(role, "'" + id_company + "'") tab = plsql.query(qry, engine) qry = """set role {}; select case when trim(codigo) = '-' then '' else trim(codigo) end as matter, trim(lower(regexp_replace(statement , '\.', ''))) as state, count(*) from raw.tipificador group by matter, state;""".format(role) tab_tipi = plsql.query(qry, engine) tab_join = tab.merge(tab_tipi, left_on='matter', right_on='matter', how='inner') tab_result = tab_join.groupby('matter').apply(first_row).sort_values('count_x', ascending = False) return tab_result
def proportion_nulls_all_columns(engine, role, schema, table, na='__null__', ind=False): props = [] counts = [] qry = """ set role {}; select column_name from information_schema.columns where table_name = '{}' and table_schema = '{}';""".format(role, table, schema) result = plsql.query(qry, engine) for i, row in result.iterrows(): nulls = count_nulls(engine, role, schema, table, row.column_name, na, ind).values length = total_rows(engine, role, schema, table).values counts.append(float(nulls)) props.append(float(nulls / length)) result['count'] = counts result['proportion'] = props return result
def model_id_graphs(model_id, role, engine, path): """ Running and save graphs for every model """ print('Creating graphs.......') qry = """set role {}; select * from results.models where model_id = {}; """.format(role, model_id) tab_name = plsql.query(qry, engine) model_group_id = tab_name.model_group_id[0] model_type = tab_name.model_type[0] fig_name = ( "group_{group_model_id}_modelid_{model_id}_{model_type}.png".format( group_model_id=tab_name.model_group_id[0], model_id=model_id, model_type=tab_name.model_type[0])).lower() fig_path = path gg_hist_all, gg_hist_lab = model_id_score_gghist(model_id, model_group_id, model_type, role, engine) gg_hist_all.save(filename=fig_name, path=fig_path + 'histogram_all/') gg_hist_lab.save(filename=fig_name, path=fig_path + 'histogram_label/') gg_prec = model_id_precsion_recall(model_id, role, engine) gg_prec.save(filename=fig_name, path=fig_path + 'precision/') gg_impo = model_id_feature_importance(model_id, role, engine) gg_impo.save(filename=fig_name, path=fig_path + 'features/')
def facility_info(engine, id_company): """ Function to print company's general information engine: query id_company: rutempresamask """ qry = """set role direccion_trabajo_inspections_write; select * from raw.inspections_complete where rutempresamask = {};""".format("'" + id_company + "'") tab_emp = plsql.query(qry, engine) total = tab_emp.idfiscalizacion.count() print('Total inspections:', tab_emp.idfiscalizacion.count()) print('Total inspections with violations:', tab_emp.infractor.sum()) print('% inspections with violations:', round(100*tab_emp.infractor.sum()/total) ) print("\n") print('Total proactive inspections:', sum(tab_emp["solicitante"] == 'Por Programa')) print('Total reactive inspections:', sum(tab_emp["solicitante"] != 'Por Programa')) print('% proactive inspections:', round(100*sum(tab_emp["solicitante"] == 'Por Programa')/total) ) print("\n") print('Total urgent inspections:', tab_emp.urgencia.sum()) print('% urgent inspections:', round(100*tab_emp.urgencia.sum()/total) ) print("\n") print('Total matters inspected:', tab_emp.num_materias.sum().astype('int')) print('Total matters infractioned:', tab_emp.infra.sum().astype('int')) print('% matters infractioned:', round(100*tab_emp.infra.sum().astype('int')/tab_emp.num_materias.sum().astype('int')) )
def rows_by_group(engine, role, schema, table, group): qry = """set role {}; select {}, count(*) from {}.{} group by {} order by {}; """ result = plsql.query(qry.format(role, group, schema, table, group, group), engine) return result
def get_table(engine, y, X, start_month, end_month, label_query, nulls=True, sample=False, sample_level=.8): # Test data: include nulls if nulls == True: if nulls == True: na = "" # Training data: don't include nulls else: na = " where {} is not null".format(y) # If undersampling data to correct imbalance if sample == True: samp = "random_system_not_feature > {} or {} is true".format( sample_level, y) if nulls == False: samp = " and " + samp else: samp = " where " + samp else: samp = "" # Get strings for correct table name table_start = start_month.strftime('%Y-%m-%d').replace('-', '') table_end = end_month.strftime('%Y-%m-%d').replace('-', '') # Load in the model matrix lab_qry = label_query.format(table_start=table_start, table_end=table_end) qry = """set role direccion_trabajo_inspections_write; with label as ({lab_qry}) select t.entity_id, t.month_year, {features}, {label} from staging.train_monthly_split_{table_start}_{table_end} t left join label l on t.entity_id = l.entity_id and t.month_year = l.month_year {na}{samp}; """.format(lab_qry=lab_qry, features=', '.join(X), label=y, table_start=table_start, table_end=table_end, na=na, label_query=label_query, samp=samp) df = plsql.query(qry, engine) return df
def check_table(engine, start_date, end_date, drop_table=False): table_start = start_date.strftime('%Y-%m-%d').replace('-', '') table_end = end_date.strftime('%Y-%m-%d').replace('-', '') # Check for existence of staging table for TRAINING time split df = plsql.query( """set role direccion_trabajo_inspections_write; select table_name from information_schema.tables where table_schema = 'staging' and table_name = 'train_monthly_split_{}_{}'; """.format(table_start, table_end), engine) # If doesn't exist or config says to drop table, create new table (TRAINING) if df.shape[0] == 0 or drop_table == True: utab.create_table(engine, start_date, end_date)
def count_nulls(engine, role, schema, table, column, na='__null__', ind=False): qry = """set role {}; select sum(case when {} {} then 1 else 0 end) from {}.{}; """ ##### Create NA condition for query # If NA value is NULL, do not pass any value for "na" parameter # (this is the default case) if na == '__null__': na_qry = 'is null' # If NA value is something other than NULL, pass value # with an indicator for whether this value should be treated as a string else: if ind == False: na = "'{}'".format(na) na_qry = '= ' + na result = plsql.query(qry.format(role, column, na_qry, schema, table), engine) return result
def model_id_score_gghist(model_id, model_group_id, model_type, role, engine): """ This function creates de scores histogram Parameters: model_id = model id to plot group_model_id role engine Return histogram """ qry = """set role {}; select * from results.predictions where model_id = {}; """.format(role, model_id) df = plsql.query(engine=engine, qry=qry) model_type = model_type model_id = model_id month_start = df.as_of_date[0] gg_hist_label = ( ggplot(df, aes(x='score')) + geom_histogram(bins=25) + ggtitle('{model_type}; model_id = {model_id},\n{month_start} '.format( model_type=model_type, model_id=model_id, month_start=month_start)) + facet_wrap('~label_value', scales='free_y') + theme(figure_size=(5, 3))) gg_hist_all = ( ggplot(df, aes(x='score')) + geom_histogram(bins=25) + ggtitle('{model_type}; model_id = {model_id},\n{month_start} '.format( model_type=model_type, model_id=model_id, month_start=month_start)) + theme(figure_size=(4, 3))) return gg_hist_all, gg_hist_label
def model_id_precsion_recall(model_id, role, engine): qry = """set role {}; select * from (select model_id, model_type, value, evaluation_start_time, evaluation_end_time, prediction_frequency, split_part(metric, '|', 1) AS metric_name, split_part(metric, '|', 2) AS null_param, split_part(metric, '|', 3) AS cutoff, split_part(metric, '|', 4) AS type, split_part(metric, '|', 5) AS subset from results.evaluations a left join (select model_id, model_group_id, model_type from results.models) b using (model_id) where a.model_id = {}) s where s.subset = 'all_data' and s.type = 'pct'; """.format(role, model_id) df = plsql.query(engine=engine, qry=qry) tab = (df[df['metric_name'].isin(['precision', 'recall', 'fallout'])]).reset_index() tab.cutoff = tab.cutoff.astype('float') gg_precrecall = (ggplot( tab, aes(x='cutoff', y='value', color='metric_name', group='metric_name')) + geom_line(size=1) + xlab('Population (Prop.)') + ylab('Value') + facet_wrap('~null_param') + ggtitle( '{model_type}; model_id = {model_id},\n{month_start} ' .format(model_type=df.model_type[0], model_id=df.model_id[0], month_start=df.evaluation_start_time[0])) + theme(figure_size=(7, 3))) return gg_precrecall
def model_id_feature_importance(model_id, role, engine): qry = """set role {}; select * from results.feature_importances a left join (select model_id, model_group_id, model_type from results.models) b using (model_id) where a.model_id = {}; """.format(role, model_id) df = plsql.query(engine=engine, qry=qry) df_sorted = df.sort_values(by='feature_importance', ascending=True) df_sorted = df_sorted.nlargest(20, 'feature_importance') df_sorted['feature_clean'] = df_sorted.feature.str.replace( "_", " ").str.title() ## order the features order_features = list(df_sorted.feature_clean) ## create categorical version for graph df_sorted['feature_categorical'] = df_sorted.feature_clean.astype( 'category', ordered=True, categories=order_features) gg_importance = ( ggplot(df_sorted, aes(x='feature_categorical', y='feature_importance')) + geom_bar(stat='identity', fill='#1f497d', alpha=0.2, color='black') + xlab('Feature') + ylab('Importance') + ggtitle('{model_type}; model_id = {model_id} '.format( model_type=df.model_type[0], model_id=df.model_id[0])) + theme(axis_text_x=element_text(size=7, angle=90), figure_size=(7, 3))) return gg_importance
def count_distinct(engine, role, schema, table, column): qry = """set role {}; select count(distinct {}) from {}.{}; """ result = plsql.query(qry.format(role, column, schema, table), engine) return result
def total_rows(engine, role, schema, table): qry = """set role {}; select count(*) from {}.{}; """ result = plsql.query(qry.format(role, schema, table), engine) return result