def upload_to_db(table, db):
    """Upload a csv to the cod database."""
    # determine schema (uploading to cod or cancer)
    upload_path = _get_upload_path(table)
    sesh = get_session(db)
    inf = Infiles(table=table, schema='cod', session=sesh)
    inf.infile(path=upload_path, with_replace=False, commit=True)
def upload(self): """ Attempts to infile any .csv files found in the /{output_dir}/{version_id} directory to gbd.output_le_decomp_v{self.process_version_id}. Any .csv files not to be uploaded should be stashed in subfolders. """ if not self._process_version_id: raise RuntimeError("A process version must be created " "before results can be uploaded.") table_name = list( GBDProcessVersion(self._process_version_id, env=self.env).tables)[0] session = get_session(conn_def=self.conn_def) infiler = Infiles(table=table_name, schema=self._schema, session=session) print(f"Starting upload at {datetime.now().time()}") infiler.indir(path=self.write_dir, partial_commit=True, sort_files=True) session.close() print(f"Finished upload at {datetime.now().time()}")
def upload_to_table(file_list, table_name, env, raise_on_failure):
    if env == DBEnv.PROD:
        conn_def = _ConnDefs.PROD.value
    elif env == DBEnv.DEV:
        conn_def = _ConnDefs.DEV.value
    sesh = get_session(conn_def=conn_def, connectable=True)
    infiler = Infiles(table=table_name, schema="gbd", session=sesh)
    logger.info("Beginning infile, time = {}".format(time.time()))
    for f in file_list:
        try:
            logger.info("infiling {}, time = {}".format(f, time.time()))
            # summaries are saved with the column 'mean'; the table
            # expects 'val', so rename on upload
            infiler.infile(f, with_replace=False, commit=True,
                           rename_cols={'mean': 'val'})
        except Exception:
            if raise_on_failure:
                raise
            tb = traceback.format_exc()
            logger.error("failed to infile {} with exception {}; "
                         "skipping to next file".format(f, tb))
    logger.info("Done infile, time = {}".format(time.time()))
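# Usage sketch (not from the source): builds the file list with glob and
# uploads to a dev table; the path and table name are hypothetical, and
# DBEnv is the enum the function above already references.
import glob

files = sorted(glob.glob("/path/to/summaries/*.csv"))
upload_to_table(files, "output_example_v1", DBEnv.DEV,
                raise_on_failure=False)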
def upload_cod_summaries(directories, conn_def):
    sesh = get_session(conn_def)
    # disable unique checks for the duration of the bulk load
    exec_query("SET unique_checks = 0", sesh)
    inf = Infiles(table='output', schema='cod', session=sesh)
    for directory in directories:
        inf.indir(path=directory, with_replace=True,
                  partial_commit=True, commit=True)
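# Follow-up sketch (not from the source): the function above leaves
# unique_checks disabled on the session; a caller could restore it after
# the bulk load finishes, using the same exec_query helper.
exec_query("SET unique_checks = 1", sesh)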
def upload(self, sm, process_vers, in_dir, with_replace=True):
    '''Args:
        sm: 'single' or 'multi', referring to which table to use
        process_vers: process version id, from the output of the step above
        in_dir: filepath of csvs to be uploaded
        with_replace: whether to REPLACE rows on duplicate keys

    The connection is taken from self.conn_def ('gbd' or 'gbd_test').
    '''
    table = 'output_mmr_%s_year_v%s' % (sm, process_vers)
    sesh = get_session(self.conn_def)
    inf = Infiles(table=table, schema='gbd', session=sesh)
    inf.indir(path=in_dir, with_replace=with_replace,
              partial_commit=True, commit=True)
    activate_process_version(process_version_id=process_vers, session=sesh)
    print("Uploaded! You win at life.")
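# Usage sketch (not from the source): assumes the method above lives on
# an uploader class, hypothetically named MMRUploader, whose conn_def is
# 'gbd' or 'gbd_test'; the process version and path are placeholders.
uploader = MMRUploader(conn_def='gbd_test')
uploader.upload(sm='single', process_vers=42, in_dir='/path/to/csvs')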
def load_component_location(self, component, location_id, year_type):
    table_tmp = getattr(self, year_type)[component]
    table = table_tmp.format(self.como_version.gbd_process_version_id)
    sesh = get_session("gbd")
    infiler = Infiles(table, "gbd", sesh)
    if year_type == "single_year":
        indir_glob = (
            f"{self.como_version.como_dir}/summaries/{component}/*/"
            f"{year_type}/{location_id}/*.csv")
    else:
        indir_glob = (
            f"{self.como_version.como_dir}/summaries/{component}/*/"
            f"{year_type}/{location_id}.csv")
    infiler.indir(path=indir_glob,
                  commit=True,
                  partial_commit=True,
                  rename_cols={"mean": "val"},
                  no_raise=(IntegrityError))
def upload_gbd_summaries(process_version, conn_def, directories):
    if any('multi' in d for d in directories):
        table = 'output_cod_multi_year_v{pv}'.format(pv=process_version)
    else:
        table = 'output_cod_single_year_v{pv}'.format(pv=process_version)
    sesh = get_session(conn_def)
    inf = Infiles(table=table, schema='gbd', session=sesh)
    for directory in directories:
        print("Uploading from {}".format(directory))
        inf.indir(path=directory, with_replace=True,
                  partial_commit=True, commit=True)
    return "Uploaded"
def load_data(stagedir, conn, table_name):
    sesh = get_session(conn_def=conn)
    infiler = Infiles(table=table_name, schema='gbd', session=sesh)
    csv = '%s/%s.csv' % (stagedir, table_name)
    print('beginning infile')
    start_time = str(datetime.datetime.now().time())
    infiler.infile(path=csv, commit=True)
    print('done with infiles at {} (started {})'.format(
        str(datetime.datetime.now().time()), start_time))
    return None
def upload(sev_version_id, process_version_id, table_type):
    # gather all summary csvs for this table type into one frame
    file_pattern = "FILEPATH/sev/{v}/summaries/{tt}_*.csv".format(
        v=sev_version_id, tt=table_type)
    data = []
    for f in glob.glob(file_pattern):
        data.append(pd.read_csv(f))
    data = pd.concat(data)
    data.fillna(0, inplace=True)
    id_cols = [c for c in data if c.endswith('_id')]
    data[id_cols] = data[id_cols].astype(int)

    table_name = "output_sev_{}_v{}".format(
        table_type, process_version_id.gbd_process_version_id)
    data_dir = "FILEPATH/sev/{v}/summaries/{tt}".format(
        v=sev_version_id, tt=table_type)
    os.mkdir(data_dir)

    # sort, then split into one csv per year (or year-pair) for infiling
    if table_type == 'single_year':
        data = data[['measure_id', 'year_id', 'location_id', 'sex_id',
                     'age_group_id', 'rei_id', 'metric_id',
                     'val', 'upper', 'lower']].sort_values(
            ['measure_id', 'year_id', 'location_id', 'sex_id',
             'age_group_id', 'rei_id', 'metric_id'])
        year_ids = data['year_id'].drop_duplicates().tolist()
        for yr in year_ids:
            file = os.path.join(data_dir,
                                'single_year_{}.csv'.format(int(yr)))
            data.loc[data['year_id'] == yr].to_csv(file, index=False)
            os.chmod(file, 0o775)
    else:
        data = data[['measure_id', 'year_start_id', 'year_end_id',
                     'location_id', 'sex_id', 'age_group_id', 'rei_id',
                     'metric_id', 'val', 'upper', 'lower']].sort_values(
            ['measure_id', 'year_start_id', 'year_end_id', 'location_id',
             'sex_id', 'age_group_id', 'rei_id', 'metric_id'])
        year_ids = data[['year_start_id', 'year_end_id']] \
            .drop_duplicates().to_dict('split')['data']
        for yr in year_ids:
            file = os.path.join(
                data_dir,
                'multi_year_{}_{}.csv'.format(int(yr[0]), int(yr[1])))
            data.loc[(data['year_start_id'] == yr[0]) &
                     (data['year_end_id'] == yr[1])].to_csv(
                file, index=False)
            os.chmod(file, 0o775)

    # infile each per-year csv in sorted order
    sesh = get_session(conn_def='gbd', connectable=True)
    infiler = Infiles(table=table_name, schema='gbd', session=sesh)
    file_list = sorted(os.listdir(data_dir))
    for f in file_list:
        print(f)
        infiler.infile(os.path.join(data_dir, f),
                       with_replace=False, commit=True)
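# Usage sketch (not from the source): despite its name, the
# process_version_id argument above is used as an object exposing
# gbd_process_version_id, e.g. a GBDProcessVersion as seen earlier in
# this file; the ids here are hypothetical.
pv = GBDProcessVersion(6789, env='dev')
upload(sev_version_id=123, process_version_id=pv, table_type='single_year')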
def load_data(stagedir, conn, table_name):
    """Loads data into the specified db using the db_tools Infiles
    object."""
    sesh = get_session(conn_def=conn)
    infiler = Infiles(table=table_name, schema='gbd', session=sesh)
    csv = '%s/%s.csv' % (stagedir, table_name)
    print('beginning infile')
    start_time = str(datetime.datetime.now().time())
    infiler.infile(path=csv, commit=True, with_replace=True)
    print('done with infiles at {} (started {})'.format(
        str(datetime.datetime.now().time()), start_time))
    return None
def load_location(self, file_list):
    self.engine.engine.dispose()
    session = self.engine.create_session(scoped=True).session
    infiler = Infiles(self.table_name, self._schema, session)
    for file in file_list:
        try:
            start_time = time.time()
            infiler.infile(file, with_replace=False, commit=True,
                           rename_cols={'mean': 'val'})
            elapsed = time.time() - start_time
            logger.info("Infiled file {} in {} seconds".format(
                file, elapsed))
        except exc.IntegrityError:
            logger.info("Received IntegrityError for file {}, "
                        "skipping infile at time {}".format(
                            file, time.time()))
        except Exception as e:
            logger.info(e)
            raise
    session.close()
def upload_diagnostics(parent_dir):
    sesh = get_session('codcorrect')
    inf = Infiles(table='diagnostic', schema='codcorrect', session=sesh)
    inf.infile(path='{pd}/_temp/upload_diagnostics.csv'.format(pd=parent_dir),
               with_replace=True, commit=True)
def upload_diagnostics(parent_dir):
    sesh = get_session('codcorrect')
    inf = Infiles(table='diagnostic', schema='codcorrect', session=sesh)
    inf.infile(path='FILEPATH.csv', with_replace=True, commit=True)