def main():
    """One-hot encode admissions/diagnoses CCS categories and persist the result."""
    print('Running admissions_ccs_ohe.py. This file performs One Hot Encoding for Admissions and Diagnoses with Higher Level categorization with CCS Codes')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    ohe_frame = perform_ohe()
    # Replace the target table wholesale on every run.
    dbm.write_df_table(
        ohe_frame,
        table_name='admissions_ccs_ohe',
        schema='datasets',
        if_exists='replace')
def main():
    """Build admission-level topic-score features and write them to the DB."""
    print('Running admissions_topic_scores.py')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('Loading DataFrame')
    topic_features = create_adm_topic_features()
    print('Successfully Loaded DataFrame now writing to DB!')
    # Replace the target table wholesale on every run.
    dbm.write_df_table(
        topic_features,
        table_name='admissions_topic_scores',
        schema='datasets',
        if_exists='replace')
def main():
    """Load datasets from a local directory into the database."""
    print('Parsing Census datasets')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # NOTE(review): machine-specific absolute path — consider deriving this
    # from the repo root instead. Also, the banner says "Census" but the
    # loader called is load_irs_data — confirm which is intended.
    directory = 'C:/Users/User/Dropbox/Documents/Analytics/Analyses/SBA/data/'
    load_irs_data(dbm, directory)
def main():
    """Parse SBA FOIA datasets from a local directory and load them."""
    print('Parsing FOIA datasets')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # NOTE(review): machine-specific absolute path — consider deriving this
    # from the repo root as the other loaders do.
    directory = '/Users/VincentLa/git/datasci-sba/src/data/sba/'
    load_sba_datasets(dbm, directory)
def main():
    """Parse and load the SF DataSF campaign-finance datasets."""
    print('Parsing and Loading SF Data Campaign Finance Datasets')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # Resolve the data directory relative to the repository root so the
    # script works regardless of the current working directory.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(dbm, os.path.join(repo_root, 'src'))
def main():
    """Clean the California statewide election results data."""
    print('Cleaning California Statewide Election Results Data')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # Resolve the data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    clean_datasets(dbm, os.path.join(repo_root, 'src', 'casos'))
def main():
    """Parse and load the MapLight California bulk datasets."""
    print('Parsing and Loading MapLight California Bulk Data Sets')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # Resolve the data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(dbm, os.path.join(repo_root, 'src', 'maplight'))
def main():
    """Parse the IRS datasets and load them into the database."""
    print('Parsing IRS datasets')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # Resolve the data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_irs_data(dbm, os.path.join(repo_root, 'src', 'data', 'irs'))
def main():
    """Parse and load the CEDA California local election results."""
    print('Parsing and Loading California Local Election Results Data')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # Resolve the data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(dbm, os.path.join(repo_root, 'src', 'ceda'))
def main():
    """Parse and load the Home Credit Default Risk Kaggle datasets."""
    print('Parsing and Loading Kaggle Competition Datasets')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # Resolve the data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(dbm, os.path.join(repo_root, 'src', 'home-credit-default-risk'))
def main():
    """Write a sqlalchemy table definition for the requested table to a file."""
    args = get_args()
    # Guard clause: this tool only supports postgres connection URLs.
    if 'postgresql' not in args.db_url:
        print("Does not look like you passed in a valid postgres url.")
        return
    dbm = DBManager(db_url=args.db_url)
    # begin() gives a transactional connection that is closed on exit.
    with dbm.engine.begin() as conn:
        write_file(conn, args.table_name, args.output_file)
    print('Done writing table definitions to file', args.output_file)
def main():
    """Label note events with LDA topics and write the result to the DB.

    Loads the pickled dictionary and LDA model artifacts produced by the
    topic-modeling step, scores each note, saves the labeled DataFrame
    locally as a fallback, then writes it to features.noteevents_with_topics.
    """
    print(
        'Running noteevents_with_topics to read model objects and label note events'
    )
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('Loading DataFrame')
    df = dbm.load_query_table(QUERY)
    print('Loading Dictionary, and LDA Model Objects!')
    # Use context managers so the handles are closed even on error; the
    # original opened all four files and never closed them.
    with open('./inventory/dictionary.obj', 'rb') as fh:
        dictionary = pickle.load(fh)
    with open('./inventory/lda_model.obj', 'rb') as fh:
        lda_model = pickle.load(fh)
    with open('./inventory/lda_model_tfidf.obj', 'rb') as fh:
        lda_model_tfidf = pickle.load(fh)
    print('Using LDA Model Objects to Label Notes!')
    df_topic_score = label_charts_with_topics(df, dictionary, lda_model,
                                              lda_model_tfidf)
    print(
        'Done Labeling Notes! Now saving the DF Locally just in case write to DB Fails'
    )
    # Closing the handle guarantees the fallback pickle is fully flushed.
    with open('./inventory/noteevents_with_topics_df.obj', 'wb') as fh:
        pickle.dump(df_topic_score, fh)
    print('Finally, writing to DB!')
    dbm.write_df_table(df_topic_score,
                       table_name='noteevents_with_topics',
                       schema='features',
                       if_exists='replace',
                       use_fast=True)
    print('Done Writing to DB!')
def main():
    """Run LDA topic modeling over note events and pickle the artifacts.

    Loads the notes via QUERY, fits the dictionary plus plain and TF-IDF
    LDA models, and writes all three objects under ./inventory/.
    """
    print('Running noteevents_lda_models to perform topic modeling')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('Loading DataFrame')
    df = dbm.load_query_table(QUERY)
    print('Successfully Loaded DataFrame, now running LDA!')
    processed_docs = process_docs(df)
    dictionary = return_dictionary(processed_docs)
    lda_model, lda_model_tfidf = run_lda(processed_docs, dictionary)
    print('Finished Running LDA! Now Writing objects to disk!')
    # Context managers ensure each pickle is flushed and closed; the original
    # left the handles open, risking truncated files on interpreter exit.
    with open('./inventory/dictionary.obj', 'wb') as fh:
        pickle.dump(dictionary, fh)
    with open('./inventory/lda_model.obj', 'wb') as fh:
        pickle.dump(lda_model, fh)
    with open('./inventory/lda_model_tfidf.obj', 'wb') as fh:
        pickle.dump(lda_model_tfidf, fh)
def main():
    """Fetch geocoded fields for SBA SFDO records and print them.

    Yelp enrichment is currently disabled; only the Google Maps geocoding
    step runs.
    """
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    # Removed an unused local `columns` that was also an accidentally
    # double-nested list; nothing in this function referenced it.
    # print('Getting Yelp data from Yelp API')
    # sfdo_yelp = get_yelp_fields(dbm)
    # print('Getting Geocoded fields from Google Maps API')
    sfdo_geocoded = get_geocoded_fields(dbm)
    print(sfdo_geocoded)
def main():
    """Main function to run tasks."""
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')
    # Parse-stage tasks, prepended only when --run_parse is requested.
    parse_tasks = [
        'parse.00_01_01_load_sba_datasets.py',
        'parse/00_01_02_sba__foia_7a_1991_1999',
        'parse/00_01_03_sba__foia_7a_2000_2009',
        'parse/00_01_04_sba__foia_7a_2010_present',
        'parse/00_01_05_sba__foia_504_1991_present',
        'parse.00_02_01_load_census_datasets.py',
        'parse/00_02_02_census__zip_business_patterns',
        'parse.00_03_01_load_irs_datasets.py',
        'parse/00_03_02_irs__zip_data',
    ]
    # Query tasks that always run (disabled entries kept for reference).
    tasks = [
        # 'queries/stg_analytics/00_00_create_schema',
        # 'queries/stg_analytics/00_01_01_sba_sfdo_zips',
        # 'queries/stg_analytics/00_01_02_sba_sfdo',
        'queries.stg_analytics.sba_sfdo_api_calls.py',
        # 'queries/stg_analytics/00_01_04_sba_sfdo_all',
        # 'queries/stg_analytics/00_02_irs_income',
        # 'queries/stg_analytics/00_03_census_naics',
        # 'queries/trg_analytics/00_01_sba_region_level',
    ]
    files = parse_tasks + tasks if args.run_parse else tasks
    run_files(dbm, files, args.db_url)
    endtime = dt.datetime.now()
    duration = endtime - starttime
    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(duration))
    print('Done!')
    print('\n')
def main():
    """Main function to run tasks."""
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')
    # Parse-stage tasks, prepended only when --run_parse is requested.
    parse_tasks = [
        'parse.load_ccs_map.py',
    ]
    # Query tasks that always run (disabled entries kept for reference).
    tasks = [
        # 'queries/features/create_schema',
        # 'queries/datasets/create_schema',
        # 'queries.features.noteevents_lda_models.py',
        # 'queries.features.noteevents_with_topics.py',
        # 'queries/features/add_indexes_noteevents_with_topics',
        # 'queries/datasets/admissions_diagnoses_icd_ccs_mapping',
        # 'queries.datasets.admissions_ccs_ohe.py',
        # 'queries.datasets.admissions_topic_scores.py',
        # 'queries/datasets/model_demog',
        # 'queries/datasets/model_demog_dx',
        'queries/datasets/model_demog_dx_notetopics',
    ]
    files = parse_tasks + tasks if args.run_parse else tasks
    print('files are')
    print(files)
    run_files(dbm, files, args.db_url)
    endtime = dt.datetime.now()
    duration = endtime - starttime
    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(duration))
    print('Done!')
    print('\n')
def main():
    """Main function to run tasks."""
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')
    # Parse-stage tasks, prepended only when --run_parse is requested.
    parse_tasks = [
        'parse.load_datasf_campaign_finance_proof_of_concept.py',
        # 'parse.load_datasf_campaign_finance.py',
        # 'parse.load_maplight_california.py',
        # 'parse.clean_casos_california_statewide_election_results.py',
        # 'parse.load_casos_california_statewide_election_results.py',
        # 'parse.load_ceda_california_local_election_results.py',
    ]
    # Query tasks (all currently disabled; kept for reference).
    tasks = [
        # 'queries/stg_analytics/create_schema',
        # 'queries/stg_analytics/stg_candidate_contributions',
        # 'queries/stg_analytics/stg_candidate_election_results',
        # 'queries/trg_analytics/create_schema',
        # 'queries/trg_analytics/candidate_contributions',
    ]
    files = parse_tasks + tasks if args.run_parse else tasks
    print('files are')
    print(files)
    run_files(dbm, files, args.db_url)
    endtime = dt.datetime.now()
    duration = endtime - starttime
    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(duration))
    print('Done!')
    print('\n')
def main():
    """Label note events with LDA topics, sourcing notes from Hive.

    Reads the notes through a local Hive connection, loads the pickled
    dictionary/LDA model artifacts, scores each note, saves a local pickle
    fallback, and dumps the result to CSV (writing back to Hive from pandas
    is impractical).
    """
    print(
        'Running noteevents_with_topics to read model objects and label note events'
    )
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    conn = hive.Connection(host='localhost', port=10000, auth='NOSASL')
    print('Loading DataFrame')
    # df = dbm.load_query_table(QUERY)
    df = pd.read_sql(QUERY, conn)
    print('Loading Dictionary, and LDA Model Objects!')
    # Use context managers so the handles are closed even on error; the
    # original opened all four files and never closed them.
    with open('./inventory/dictionary.obj', 'rb') as fh:
        dictionary = pickle.load(fh)
    with open('./inventory/lda_model.obj', 'rb') as fh:
        lda_model = pickle.load(fh)
    with open('./inventory/lda_model_tfidf.obj', 'rb') as fh:
        lda_model_tfidf = pickle.load(fh)
    print('Using LDA Model Objects to Label Notes!')
    df_topic_score = label_charts_with_topics(df, dictionary, lda_model,
                                              lda_model_tfidf)
    print(
        'Done Labeling Notes! Now saving the DF Locally just in case write to DB Fails'
    )
    # Closing the handle guarantees the fallback pickle is fully flushed.
    with open('./inventory/noteevents_with_topics_df.obj', 'wb') as fh:
        pickle.dump(df_topic_score, fh)
    print('Finally, writing to DB!')
    # Writing back to HIVE is hard from Pandas so first output to CSV
    df_topic_score.to_csv('./inventory/noteevents_with_topics.csv',
                          index=False)
    print('Done Writing to DB!')
def main():
    """Main function to run tasks."""
    pandas_display_screen_widen()
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')
    # No tables are currently selected to run.
    files = []
    run_files(dbm, files, args.db_url, start_file=None)
    endtime = dt.datetime.now()
    duration = endtime - starttime
    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(duration))
    print('Done!')
    print('\n')
def main():
    """Main function to run tasks."""
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print(SQL_PATH)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')
    # Task list to execute (disabled parse entries kept for reference).
    files = [
        # 'parse.00_01_01_load_sba_datasets.py',
        # 'parse/00_01_02_sba__foia_7a_1991_1999',
        # 'parse/00_01_03_sba__foia_7a_2000_2009',
        # 'parse/00_01_04_sba__foia_7a_2010_present',
        # 'parse/00_01_05_sba__foia_504_1991_present',
        # 'parse.00_02_01_load_census_datasets.py'
        # 'parse/00_02_02_census__zip_business_patterns',
        # '00_03_01_load_irs_datasets.py',
        # 'parse/00_03_02_irs__zip_data',
        'queries/stg_analytics/00_01_01_sba_sfdo_zips',
        'queries/stg_analytics/00_01_sba_sfdo',
        'queries/stg_analytics/00_02_irs_income',
        'queries/stg_analytics/00_03_census_naics',
        'queries/trg_analytics/00_01_sba_metrics',
    ]
    run_files(dbm, files, args.db_url)
    endtime = dt.datetime.now()
    duration = endtime - starttime
    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(duration))
    print('Done!')
    print('\n')
def main():
    """Main function to run tasks."""
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')
    # No parse-stage tasks for this pipeline.
    parse_tasks = []
    # Full Hive ETL: raw table loads, topic modeling, then model datasets.
    tasks = [
        './etl/raw/hdfs_setup.sh',
        './etl/raw/admissions.hql',
        './etl/raw/callout.hql',
        './etl/raw/caregivers.hql',
        './etl/raw/ccs_dx_map.hql',
        './etl/raw/ccs_proc_map.hql',
        './etl/raw/chartevents.hql',
        './etl/raw/cptevents.hql',
        './etl/raw/d_cpt.hql',
        './etl/raw/d_icd_diagnoses.hql',
        './etl/raw/d_icd_procedures.hql',
        './etl/raw/d_items.hql',
        './etl/raw/d_labitems.hql',
        './etl/raw/datetimeevents.hql',
        './etl/raw/diagnoses_icd.hql',
        './etl/raw/drgcodes.hql',
        './etl/raw/icustays.hql',
        './etl/raw/inputevents_cv.hql',
        './etl/raw/inputevents_mv.hql',
        './etl/raw/labevents.hql',
        './etl/raw/microbiologyevents.hql',
        './etl/raw/noteevents.hql',
        './etl/raw/outputevents.hql',
        './etl/raw/patients.hql',
        './etl/raw/prescriptions.hql',
        './etl/raw/procedureevents_mv.hql',
        './etl/raw/procedures_icd.hql',
        './etl/raw/services.hql',
        './etl/raw/transfers.hql',
        'queries.features.noteevents_lda_models.py',
        'queries.features.noteevents_with_topics.py',
        './etl/model/noteevents_with_topics.hql',
        'queries/datasets/admissions_diagnoses_icd_ccs_mapping.hql',
        'queries.datasets.admissions_ccs_ohe.py',
        './etl/model/admissions_ccs_ohe.hql',
        'queries.datasets.admissions_topic_scores.py',
        './etl/model/admissions_topic_scores.hql',
        'queries/datasets/model_demog.hql',
        'queries/datasets/model_demog_dx.hql',
        'queries/datasets/model_demog_dx_notetopics.hql',
    ]
    files = parse_tasks + tasks if args.run_parse else tasks
    print('files are')
    print(files)
    run_files(dbm, files, args.db_url)
    endtime = dt.datetime.now()
    duration = endtime - starttime
    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(duration))
    print('Done!')
    print('\n')