def insert_new_sites(df, db):
    """Append new rows to cod.site, refusing to upload duplicates.

    Args:
        df: DataFrame of rows to insert; must contain a 'site_name' column
            (all columns are passed through to the table).
        db: connection definition handed to ez.query / ez.get_engine.

    Raises:
        AssertionError: if any site_name in df already exists in cod.site.
    """
    # check for overlaps before uploading
    name_df = df[['site_name']].drop_duplicates().dropna()
    unique_names = name_df['site_name'].unique()
    # NOTE(review): names are interpolated directly into SQL -- injection-prone
    # if site names are untrusted; parameterize if the query layer allows it.
    names_clause = "','".join(unique_names)
    names_query = """
        SELECT * FROM cod.site
        WHERE site_name IN ('{names_clause}')
    """.format(names_clause=names_clause)
    overlap = ez.query(names_query, conn_def=db)
    if len(overlap) > 0:
        raise AssertionError(
            "Conflicting site_name's already present: \n{overlap}".format(
                overlap=overlap)
        )
    engine = ez.get_engine(db)
    conn = engine.connect()
    try:
        # Fix: close the connection even when to_sql raises (was leaked before).
        df.to_sql('site', conn, if_exists='append', index=False)
    finally:
        conn.close()
    print("Uploaded new {name_col}s: \n{name_df}".format(
        name_col='site', name_df=df))
def unmark_current_best(self):
    """Demote the current best row in epi.output_version.

    Stamps best_end with the current time and clears the is_best flag.
    Returns the raw result of the UPDATE.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    update_sql = """
        UPDATE epi.output_version
        SET best_end='{be}', is_best=0
        WHERE is_best=1""".format(be=timestamp)
    engine = ezfuncs.get_engine(conn_def="como-epi")
    return engine.execute(update_sql)
def insert_extract_type(extract_type, conn_def='ADDRESS'):
    """Insert a new extract type into cod.claude_extract_type.

    Args:
        extract_type: name of the extract type to add.
        conn_def: connection definition for ezfuncs.get_engine.

    Raises:
        AssertionError: if the extract type already exists.
    """
    assert pull_extract_type_id(extract_type, conn_def=conn_def) is None, \
        "Already exists"
    engine = ezfuncs.get_engine(conn_def)
    conn = engine.connect()
    try:
        # NOTE(review): extract_type is interpolated straight into the SQL --
        # injection-prone if the value is untrusted; parameterize if possible.
        conn.execute("""
            INSERT INTO cod.claude_extract_type
                (extract_type)
            VALUES
                ("{}")
        """.format(extract_type))
    finally:
        # Fix: guarantee the connection is released even if execute raises.
        conn.close()
def activate_sequela_set_version(sequela_set_version_id,
                                 gbd_round_id=GBD_ROUND_ID,
                                 validate=True,
                                 conn_def=None):
    """Activate a sequela set version, optionally validating it first."""
    if conn_def is not None:
        # Re-point the module-level engine at the requested connection.
        config.engine = get_engine(conn_def=conn_def)
    with session_scope() as session:
        activator = ActivateSequelaVersion(
            session, sequela_set_version_id, gbd_round_id)
        if validate:
            activator.validate_version()
        activator.activate_version()
def insert_source_id(source, conn_def='ADDRESS'):
    """Insert a new source_id into cod.source if it does not already exist.

    Args:
        source: source_name to add.
        conn_def: connection definition for ezfuncs.get_engine.
    """
    if pull_source_id(source, conn_def=conn_def) is None:
        engine = ezfuncs.get_engine(conn_def)
        conn = engine.connect()
        try:
            print('\nInserting new source to cod.source table')
            # NOTE(review): source is interpolated straight into the SQL --
            # injection-prone if untrusted; parameterize if possible.
            conn.execute("""
                INSERT INTO cod.source
                    (source_name)
                VALUES
                    ("{}")
            """.format(source))
        finally:
            # Fix: release the connection even if execute raises (was leaked).
            conn.close()
    else:
        print("Source already exists")
def get_engine(conn_def, env):
    """Retrieve a SQLAlchemy engine for the given database and environment.

    Args:
        conn_def: 'epi' or 'cod' (case/whitespace insensitive).
        env: 'prod' or 'dev' (case/whitespace insensitive).

    Raises:
        ValueError: on an unrecognized conn_def or env.
    """
    conn_def = conn_def.lower().strip()
    if conn_def not in ('epi', 'cod'):
        raise ValueError("Expected epi or cod, got {}".format(conn_def))
    env = env.lower().strip()
    if env not in ('prod', 'dev'):
        raise ValueError("Expected prod or dev, got {}".format(env))
    # cod data is always read from prod, regardless of env
    true_conn_def = 'cod' if conn_def == 'cod' else "cascade-{}".format(env)
    return ezfuncs.get_engine(conn_def=true_conn_def)
def create_gbd_process_version(self, gbd_round_id):
    """Create a new GBD process version via the stored procedure, store its
    id on self, and attach the como version id as metadata.

    Returns the raw process-version metadata JSON string from the proc.
    """
    call_sql = """
        CALL gbd.new_gbd_process_version (
            {}, 1, 'Como run', 'fix epi.ov table to accept hash',
            NULL, NULL)""".format(gbd_round_id)
    engine = ezfuncs.get_engine(conn_def="como-gbd")
    result = engine.execute(call_sql)
    first_row = result.fetchone()
    pv_meta = first_row[0]
    # The proc hands back a JSON blob; pull the new process version id out.
    self.gbd_process_version_id = int(
        json.loads(pv_meta)[0]["gbd_process_version_id"])
    meta_sql = """
        INSERT INTO gbd.gbd_process_version_metadata
            (`gbd_process_version_id`, `metadata_type_id`, `val`)
        VALUES
            ({gpvid}, 4, '{cv}')""".format(
        gpvid=self.gbd_process_version_id, cv=self.como_version_id)
    engine.execute(meta_sql)
    return pv_meta
def mark_best(self, description=""):
    """Mark this como version as best in epi.output_version.

    Unmarks the previous best, then sets best_start to now, clears
    best_end, flags is_best, and records the current user and description.
    Returns the raw result of the UPDATE.
    """
    self.unmark_current_best()
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Fix: the template previously hard-coded best_user='******' while the
    # bu=getpass.getuser() kwarg went unused; restore the {bu} placeholder
    # so the actual username is recorded.
    q = """
        UPDATE epi.output_version
        SET
            best_start='{best_start}',
            best_end=NULL,
            is_best=1,
            best_description='{best_description}',
            best_user='{bu}'
        WHERE output_version_id={ovid}
    """.format(
        best_start=now,
        best_description=description,
        bu=getpass.getuser(),
        ovid=self.como_version_id)
    eng = ezfuncs.get_engine(conn_def="como-epi")
    res = eng.execute(q)
    return res
def compute_global_ratios(year_id, drawcols):
    """Compute global YLL-based ratios for each residual cause, duplicated
    across both sexes.

    Args:
        year_id: year(s) to pull global YLL draws for.
        drawcols: draw column names to sum and average over.

    Returns:
        DataFrame of ratios by age/year/cause with sex_id 1 and 2 rows.
    """
    eng = ezfuncs.get_engine(conn_def="cod")
    ccv = pd.read_sql("""
        SELECT output_version_id FROM cod.output_version
        WHERE code_version=5 AND is_best=1""", eng).squeeze()
    sg = SuperGopher(
        {'file_pattern': '{measure_id}_{location_id}.h5',
         'h5_tablename': 'draws'},
        'FILEPATH/{ccv}/draws'.format(ccv=ccv))
    ylls = sg.content(location_id=1, year_id=year_id, sex_id=[1, 2],
                      measure_id=4)

    group_cols = ['age_group_id', 'year_id']
    ratios = []
    for resid_cid, yldmap in rkey.groupby('input_cause_id'):
        # numerator: YLLs for the residual cause itself
        numer = ylls[ylls.cause_id == resid_cid]
        numer = numer.groupby(group_cols)[drawcols].sum().mean(axis=1)
        # denominator: YLLs aggregated over the mapped ratio causes
        denom = ylls[ylls.cause_id.isin(yldmap.ratio_cause_id.unique())]
        denom = denom.groupby(group_cols)[drawcols].sum().mean(axis=1)
        # compute the ratio; divide-by-zero / missing cells become 0
        ratio = (numer / denom).reset_index()
        ratio = ratio.replace(np.inf, 0).replace(np.NaN, 0)
        ratio["cause_id"] = resid_cid
        ratios.append(ratio)

    df = pd.concat(ratios)
    df_male = df.copy()
    df_male["sex_id"] = 1
    df_female = df.copy()
    df_female["sex_id"] = 2
    return df_male.append(df_female)
def upload_sequela_year_summaries(como_dir, process_id, location_id, measure_id): eng = ezfuncs.get_engine(conn_def="como-gbd") for tn in ['single_year', 'multi_year']: try: if tn == 'single_year': cols = ",".join([ 'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id', 'metric_id', 'sequela_id', 'val', 'lower', 'upper' ]) elif tn == 'multi_year': cols = ",".join([ 'location_id', 'year_start_id', 'year_end_id', 'age_group_id', 'sex_id', 'measure_id', 'sequela_id', 'metric_id', 'val', 'lower', 'upper' ]) summdir = os.path.join(como_dir, 'summaries', "sequela") summary_file = os.path.join( summdir, "%s_%s_%s.csv" % (measure_id, location_id, tn)) ldstr = """ LOAD DATA INFILE '{sf}' INTO TABLE gbd.output_sequela_{tn}_v{pid} FIELDS TERMINATED BY "," OPTIONALLY ENCLOSED BY '"' LINES TERMINATED BY "\\n" IGNORE 1 LINES ({cols})""".format(sf=summary_file, pid=process_id, tn=tn, cols=cols) res = eng.execute(ldstr) print 'Uploaded %s %s %s %s' % (location_id, measure_id, tn) except Exception as e: print e res = None return res
def clear_prev_data_version_status(database, table):
    """Update the data_version table: end-date and deactivate the currently
    active status row for the generic cancer NID.

    Args:
        database: connection definition passed to get_engine.
        table: name of the table under the cod schema to update.
    """
    date = make_db_datestamp()
    can_nid = utils.get_gbd_parameter('generic_cancer_nid')
    update_query = """
        UPDATE cod.{tbl}
        SET status_end = "{dt}", status='0'
        WHERE nid={nid} AND status=1
    """
    # Fix: the engine was built from `table` while the `database` argument
    # went unused; conn_def should be the database connection definition.
    engine = get_engine(conn_def=database)
    conn = engine.connect()
    try:
        conn.execute(update_query.format(tbl=table, dt=date, nid=can_nid))
    finally:
        # Fix: release the connection even if execute raises.
        conn.close()
def insert_names(name_table, name_df, df_name_col=None, conn_def='ADDRESS'):
    """Insert the name in the df to cod.name_table table.

    Args:
        name_table: one of 'site' or 'source'.
        name_df: single-column DataFrame of names; the column must match
            the table's name column ('site_name' / 'source_name').
        df_name_col: unused; kept for backward compatibility.
        conn_def: connection definition for ezfuncs.

    Raises:
        AssertionError: on an invalid table, a badly-shaped df, or when
            any of the names already exist in the target table.
    """
    # make sure the column name in the df matches that in the db
    name_tables_to_col_name = {'site': 'site_name', 'source': 'source_name'}
    assert name_table in name_tables_to_col_name.keys(), \
        "Invalid name table: {}".format(name_table)
    name_col = name_tables_to_col_name[name_table]
    assert set(name_df.columns) == set([name_col]), \
        "Pass a df with one column: '{}'. You gave a df with these " \
        "columns: {}".format(name_col, name_df.columns)
    # verify that sources are ok to upload
    if name_table == "source":
        assert_is_valid_source(name_df[name_col].unique())
    # restrict data to just that
    name_df = name_df[[name_col]].drop_duplicates().dropna()
    unique_names = name_df[name_col].unique()
    # NOTE(review): names are interpolated directly into the SQL --
    # injection-prone if names are untrusted; parameterize if possible.
    names_clause = "','".join(unique_names)
    names_query = """
        SELECT * FROM cod.{name_table}
        WHERE {name_col} IN ('{names_clause}')
    """.format(name_table=name_table, name_col=name_col,
               names_clause=names_clause)
    overlap = ezfuncs.query(names_query, conn_def=conn_def)
    if len(overlap) > 0:
        raise AssertionError(
            "Conflicting {name_col}s already present: \n{overlap}".format(
                name_col=name_col, overlap=overlap))
    engine = ezfuncs.get_engine(conn_def)
    conn = engine.connect()
    try:
        # Fix: close the connection even when to_sql raises (was leaked).
        name_df.to_sql(name_table, conn, if_exists='append', index=False)
    finally:
        conn.close()
    print("Uploaded new {name_col}s: \n{name_df}".format(name_col=name_col,
                                                         name_df=name_df))
def upload_to_db(self, user_input=True):
    """uploads the package_id to the database

    Assumes input_data global contains meta data and weights for the given
    package id. Saves farthest_step, list of dataframes containing uploaded
    tables. this method ensures the correct order of operations for the
    upload.

    KEY ASSUMPTIONS
    - You have one cause group per target
    - You do not have any OR clauses in your weight group logic

    [OPEN CONNECTION]
    [BEGIN TRANSACTION]
    1. add a new version using the given package id and description
    2. add new target cause groups to the database based on the targets in
       the input data (WILL ASSUME ONE CAUSE GROUP PER TARGET)
       a. add the actual targets using a 1:1 mapping to the new cause group
          ids created in the database (essentially, each target gets a
          random cause group id)
    3. add new weight groups to the database based on input data, associated
       with the version id in (1) in the database
       a. add new weight group logic sets that bridge between weight groups
          and weight group logic using version id from (1) to find weight
          group ids created in (3) (WILL ASSUME THERE ARE NO 'OR' LOGIC
          CLAUSES IN THE WEIGHT GROUP)
       b. add new weight group logic using version id from (1) to find
          weight group logic sets created in (3a)
    4. map the input data to newly created weight group ids and cause group
       ids using the cause group names and the weight group names in the
       input data, and using the version id created in (1) to find both.
       Then upload to weights table. Do all of this with version id from (1)
    5. mark version metadata so that new version is best
    [COMMIT CHANGES]
    [CLOSE CONNECTION]
    """
    # Ordered upload plan: (step number, display name, prep function,
    # destination table). Order matters -- later tables reference ids
    # created by earlier steps.
    name_function_order = [
        ('1', 'Versions table', self.prep_version,
         'rdp_sharedpackageversion'),
        ('2', 'Cause groups table', self.prep_cause_groups,
         'rdp_sharedcausegroup'),
        ('2a', 'Targets table', self.prep_targets,
         'rdp_sharedtarget'),
        ('3', 'Weight groups table', self.prep_weight_groups,
         'rdp_sharedwgtgroup'),
        ('3a', 'Weight logic set table', self.prep_weight_group_logic_set,
         'rdp_sharedwgtgrouplogicset'),
        ('3b', 'Weight logic table', self.prep_weight_group_logic,
         'rdp_sharedwgtgrouplogic'),
        ('4', 'Weights table', self.prep_weights,
         'rdp_sharedwgt')
    ]
    # rows_expected = printExpectedRowAdditions(do_print=user_input)
    self.farthest_step = 'Nowhere'
    self.tables_uploaded = {}
    # start a transaction - if anything fails from this point out,
    # rollback all changes
    engine = get_engine(self.conn_def)
    conn = engine.connect()
    trans = conn.begin()
    try:
        for step_number, step_name, prep_function, table_name in \
                name_function_order:
            print(
                "[{t}] ({no}): {nm}".format(
                    t=str(datetime.now()), no=step_number, nm=step_name
                )
            )
            # prep the data to upload
            df = prep_function()
            # upload it to the given table name
            RegressionUploader.upload_table_to_db(
                df, table_name, conn
            )
            if step_number == '1':
                # the version row was just created; capture its new id so
                # later steps (and the user prompt) can reference it
                self.new_version_id = self.get_new_pvid()
                print("VERSION ID: {}".format(self.new_version_id))
            # add to appended tables
            self.tables_uploaded[step_number] = df
            self.farthest_step = step_name
        print 'uploading...'
        continue_upload = 'unknown'
        if user_input:
            # keep prompting until the operator answers Y or N
            while continue_upload != 'Y' and continue_upload != 'N':
                continue_upload = raw_input(
                    'Should the new version be accepted?'
                    'Check that everything looks right above [Y/N]'
                )
                if continue_upload == 'Y':
                    print(
                        'Ok. Setting the old version'
                        'to old and the new version {n} to'
                        'best status'.format(n=self.new_version_id)
                    )
                    self.switch_best_flag(conn)
                    self.farthest_step = 'Flag switch'
                elif continue_upload == 'N':
                    print(
                        'Got it. Check that out and in the '
                        'meantime I\'ll rollback everything '
                        'that was just uploaded.'
                    )
                    self.rollback_everything(engine)
                else:
                    print(
                        'I\'m dumb and didnt understand your '
                        'input of \'{u}\'. Press either \'Y\' '
                        'or \'N\'. I\'ll keep asking \\ '
                        ' until either the end of '
                        'time or you give me a good '
                        'answer.'.format(u=continue_upload)
                    )
        else:
            # just check that
            # if rows uploaded equals rows expected
            # NOTE(review): the row-count comparison is stubbed out with
            # `if False`, so the non-interactive path always rolls back --
            # presumably intentional until the count check is implemented.
            if False:
                self.switch_best_flag(conn)
            else:
                # print(rows_expected)
                # print(rows_uploaded)
                print(
                    'rows uploaded didnt equal rows expected. '
                    'rolling back.'
                )
                self.rollback_everything(engine)
                raise
        trans.commit()
        conn.close()
    except Exception, e:
        # any failure aborts the transaction and undoes partial uploads
        trans.rollback()
        conn.close()
        print(
            "ROLLING BACK: Got an {et}: {m}".format(et=type(e), m=str(e))
        )
        self.rollback_everything(engine)
        raise(e)
def engine(self):
    """Return a SQLAlchemy engine for self.conn_def, building and caching
    it on first access."""
    if not self._engine:
        new_engine = get_engine(conn_def=self.conn_def, connectable=False)
        # short recycle interval so pooled connections are refreshed often
        new_engine.pool_recycle = 40.0
        self._engine = new_engine
    return self._engine