예제 #1
0
def main():
    """Run the admissions CCS one-hot-encoding task and persist the result."""
    print('Running admissions_ccs_ohe.py. This file performs One Hot Encoding for Admissions and Diagnoses with Higher Level categorization with CCS Codes')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    encoded_df = perform_ohe()
    # Replace any previous copy of the table in the datasets schema.
    db_manager.write_df_table(
        encoded_df,
        table_name='admissions_ccs_ohe',
        schema='datasets',
        if_exists='replace')
예제 #2
0
def main():
    """Build admission topic-score features and persist them to the database."""
    print('Running admissions_topic_scores.py')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)

    print('Loading DataFrame')
    topic_features = create_adm_topic_features()

    print('Successfully Loaded DataFrame now writing to DB!')
    # Overwrite any prior version of the table in the datasets schema.
    db_manager.write_df_table(
        topic_features,
        table_name='admissions_topic_scores',
        schema='datasets',
        if_exists='replace')
예제 #3
0
def main():
    """Parse Census source files and load them into the database."""
    print('Parsing Census datasets')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # NOTE(review): the message says Census but the loader called is
    # load_irs_data — confirm this is intentional. The hard-coded local
    # Windows path also ties this script to a single machine.
    source_dir = 'C:/Users/User/Dropbox/Documents/Analytics/Analyses/SBA/data/'
    load_irs_data(db_manager, source_dir)
def main():
    """Parse SBA FOIA source files and load them into the database."""
    print('Parsing FOIA datasets')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # NOTE(review): hard-coded absolute path limits portability — TODO confirm.
    source_dir = '/Users/VincentLa/git/datasci-sba/src/data/sba/'
    load_sba_datasets(db_manager, source_dir)
예제 #5
0
def main():
    """Parse and load the SF campaign finance datasets."""
    print('Parsing and Loading SF Data Campaign Finance Datasets')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # Resolve the data directory relative to the repository root so the
    # script works from any checkout location.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(db_manager, os.path.join(repo_root, 'src'))
예제 #6
0
def main():
    """Clean the California statewide election results data."""
    print('Cleaning California Statewide Election Results Data')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # Locate the CASOS data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    clean_datasets(db_manager, os.path.join(repo_root, 'src', 'casos'))
def main():
    """Parse and load the MapLight California bulk datasets."""
    print('Parsing and Loading MapLight California Bulk Data Sets')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # Locate the MapLight data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(db_manager, os.path.join(repo_root, 'src', 'maplight'))
예제 #8
0
def main():
    """Parse IRS source files and load them into the database."""
    print('Parsing IRS datasets')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # Locate the IRS data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_irs_data(db_manager, os.path.join(repo_root, 'src', 'data', 'irs'))
def main():
    """Parse and load the CEDA California local election results data."""
    print('Parsing and Loading California Local Election Results Data')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # Locate the CEDA data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(db_manager, os.path.join(repo_root, 'src', 'ceda'))
def main():
    """Parse and load the Kaggle competition datasets."""
    print('Parsing and Loading Kaggle Competition Datasets')
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    # Locate the competition data directory relative to the repository root.
    repo_root = uf.get_git_root(os.path.dirname(__file__))
    load_datasets(db_manager,
                  os.path.join(repo_root, 'src', 'home-credit-default-risk'))
예제 #11
0
def main():
    """Write the sqlalchemy table definition for a table to a file.

    Validates that the supplied db_url looks like a postgres URL before
    connecting; only reports success after the definition was written.
    """
    args = get_args()

    if 'postgresql' not in args.db_url:
        print("Does not look like you passed in a valid postgres url.")
    else:
        dbm = DBManager(db_url=args.db_url)
        # begin() yields a transactional connection committed/closed on exit.
        with dbm.engine.begin() as conn:
            write_file(conn, args.table_name, args.output_file)
        # Bug fix: this message previously printed unconditionally, claiming
        # success even when the db_url check failed and nothing was written.
        print('Done writing table definitions to file', args.output_file)
예제 #12
0
def main():
    """Label note events with LDA topic scores and write them to the DB.

    Loads notes via QUERY, unpickles the dictionary and both LDA models
    from ./inventory/, scores the notes, saves a local backup pickle, and
    finally writes the scored DataFrame to features.noteevents_with_topics.
    """
    print(
        'Running noteevents_with_topics to read model objects and label note events'
    )
    args = get_args()
    dbm = DBManager(db_url=args.db_url)

    print('Loading DataFrame')
    df = dbm.load_query_table(QUERY)

    print('Loading Dictionary, and LDA Model Objects!')
    # Bug fix: the original opened these files and never closed them;
    # context managers guarantee the handles are released.
    with open('./inventory/dictionary.obj', 'rb') as f:
        dictionary = pickle.load(f)
    with open('./inventory/lda_model.obj', 'rb') as f:
        lda_model = pickle.load(f)
    with open('./inventory/lda_model_tfidf.obj', 'rb') as f:
        lda_model_tfidf = pickle.load(f)

    print('Using LDA Model Objects to Label Notes!')
    df_topic_score = label_charts_with_topics(df, dictionary, lda_model,
                                              lda_model_tfidf)

    print(
        'Done Labeling Notes! Now saving the DF Locally just in case write to DB Fails'
    )
    # Bug fix: writing through a context manager also flushes the pickle,
    # so the local backup cannot be left truncated.
    with open('./inventory/noteevents_with_topics_df.obj', 'wb') as f:
        pickle.dump(df_topic_score, f)

    print('Finally, writing to DB!')
    dbm.write_df_table(df_topic_score,
                       table_name='noteevents_with_topics',
                       schema='features',
                       if_exists='replace',
                       use_fast=True)

    print('Done Writing to DB!')
예제 #13
0
def main():
    """Run LDA topic modeling over note events and persist the model objects.

    Loads notes via QUERY, trains plain and TF-IDF LDA models, then pickles
    the dictionary and both models under ./inventory/.
    """
    print('Running noteevents_lda_models to perform topic modeling')
    args = get_args()
    dbm = DBManager(db_url=args.db_url)

    print('Loading DataFrame')
    df = dbm.load_query_table(QUERY)

    print('Successfully Loaded DataFrame, now running LDA!')
    processed_docs = process_docs(df)
    dictionary = return_dictionary(processed_docs)
    lda_model, lda_model_tfidf = run_lda(processed_docs, dictionary)

    print('Finished Running LDA! Now Writing objects to disk!')
    # Bug fix: the original opened these write handles and never closed or
    # flushed them, so the pickles could be left truncated on disk; `with`
    # guarantees both flush and close.
    with open('./inventory/dictionary.obj', 'wb') as f:
        pickle.dump(dictionary, f)
    with open('./inventory/lda_model.obj', 'wb') as f:
        pickle.dump(lda_model, f)
    with open('./inventory/lda_model_tfidf.obj', 'wb') as f:
        pickle.dump(lda_model_tfidf, f)
예제 #14
0
def main():
    """Fetch geocoded fields for SBA SFDO records and print them."""
    args = get_args()
    dbm = DBManager(db_url=args.db_url)

    # Removed an unused (and accidentally double-nested) `columns` list of
    # borrower fields — nothing in this function referenced it.

    # print('Getting Yelp data from Yelp API')
    # sfdo_yelp = get_yelp_fields(dbm)

    # print('Getting Geocoded fields from Google Maps API')
    sfdo_geocoded = get_geocoded_fields(dbm)
    print(sfdo_geocoded)
예제 #15
0
def main():
    """Main function to run tasks."""
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')

    # Tasks that parse raw source data; only run when --run_parse is set.
    parse_tasks = [
        'parse.00_01_01_load_sba_datasets.py',
        'parse/00_01_02_sba__foia_7a_1991_1999',
        'parse/00_01_03_sba__foia_7a_2000_2009',
        'parse/00_01_04_sba__foia_7a_2010_present',
        'parse/00_01_05_sba__foia_504_1991_present',
        'parse.00_02_01_load_census_datasets.py',
        'parse/00_02_02_census__zip_business_patterns',
        'parse.00_03_01_load_irs_datasets.py',
        'parse/00_03_02_irs__zip_data',
    ]

    # Query/transform tasks that always run.
    tasks = [
        # 'queries/stg_analytics/00_00_create_schema',
        # 'queries/stg_analytics/00_01_01_sba_sfdo_zips',
        # 'queries/stg_analytics/00_01_02_sba_sfdo',
        'queries.stg_analytics.sba_sfdo_api_calls.py',
        # 'queries/stg_analytics/00_01_04_sba_sfdo_all',
        # 'queries/stg_analytics/00_02_irs_income',
        # 'queries/stg_analytics/00_03_census_naics',
        # 'queries/trg_analytics/00_01_sba_region_level',
    ]

    file_list = parse_tasks + tasks if args.run_parse else tasks

    # Execute every selected task in order.
    run_files(db_manager, file_list, args.db_url)

    endtime = dt.datetime.now()
    elapsed = endtime - starttime

    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(elapsed))
    print('Done!')
    print('\n')
예제 #16
0
def main():
    """Main function to run tasks."""
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')

    # Tasks that parse raw source data; only run when --run_parse is set.
    parse_tasks = [
        'parse.load_ccs_map.py',
    ]

    # Query/transform tasks that always run.
    tasks = [
        # 'queries/features/create_schema',
        # 'queries/datasets/create_schema',
        # 'queries.features.noteevents_lda_models.py',
        # 'queries.features.noteevents_with_topics.py',
        # 'queries/features/add_indexes_noteevents_with_topics',
        # 'queries/datasets/admissions_diagnoses_icd_ccs_mapping',
        # 'queries.datasets.admissions_ccs_ohe.py',
        # 'queries.datasets.admissions_topic_scores.py',
        # 'queries/datasets/model_demog',
        # 'queries/datasets/model_demog_dx',
        'queries/datasets/model_demog_dx_notetopics',
    ]

    file_list = parse_tasks + tasks if args.run_parse else tasks

    print('files are')
    print(file_list)
    # Execute every selected task in order.
    run_files(db_manager, file_list, args.db_url)

    endtime = dt.datetime.now()
    elapsed = endtime - starttime

    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(elapsed))
    print('Done!')
    print('\n')
예제 #17
0
def main():
    """Main function to run tasks."""
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')

    # Tasks that parse raw source data; only run when --run_parse is set.
    parse_tasks = [
        'parse.load_datasf_campaign_finance_proof_of_concept.py',
        # 'parse.load_datasf_campaign_finance.py',
        # 'parse.load_maplight_california.py',
        # 'parse.clean_casos_california_statewide_election_results.py',
        # 'parse.load_casos_california_statewide_election_results.py',
        # 'parse.load_ceda_california_local_election_results.py',
    ]

    # Query/transform tasks that always run (all currently disabled).
    tasks = [
        # 'queries/stg_analytics/create_schema',
        # 'queries/stg_analytics/stg_candidate_contributions',
        # 'queries/stg_analytics/stg_candidate_election_results',
        # 'queries/trg_analytics/create_schema',
        # 'queries/trg_analytics/candidate_contributions',
    ]

    file_list = parse_tasks + tasks if args.run_parse else tasks

    print('files are')
    print(file_list)
    # Execute every selected task in order.
    run_files(db_manager, file_list, args.db_url)

    endtime = dt.datetime.now()
    elapsed = endtime - starttime

    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(elapsed))
    print('Done!')
    print('\n')
예제 #18
0
def main():
    """Label note events (loaded from Hive) with LDA topics and export CSV.

    Reads notes via QUERY over a local Hive connection, unpickles the
    dictionary and LDA models from ./inventory/, scores the notes, saves a
    local backup pickle, and writes the result to CSV for a later Hive load.
    """
    print(
        'Running noteevents_with_topics to read model objects and label note events'
    )
    args = get_args()
    dbm = DBManager(db_url=args.db_url)
    conn = hive.Connection(host='localhost', port=10000, auth='NOSASL')

    print('Loading DataFrame')
    # df = dbm.load_query_table(QUERY)
    df = pd.read_sql(QUERY, conn)

    print('Loading Dictionary, and LDA Model Objects!')
    # Bug fix: the original opened these files and never closed them;
    # context managers guarantee the handles are released.
    with open('./inventory/dictionary.obj', 'rb') as f:
        dictionary = pickle.load(f)
    with open('./inventory/lda_model.obj', 'rb') as f:
        lda_model = pickle.load(f)
    with open('./inventory/lda_model_tfidf.obj', 'rb') as f:
        lda_model_tfidf = pickle.load(f)

    print('Using LDA Model Objects to Label Notes!')
    df_topic_score = label_charts_with_topics(df, dictionary, lda_model,
                                              lda_model_tfidf)

    print(
        'Done Labeling Notes! Now saving the DF Locally just in case write to DB Fails'
    )
    # Bug fix: writing through a context manager also flushes the pickle,
    # so the local backup cannot be left truncated.
    with open('./inventory/noteevents_with_topics_df.obj', 'wb') as f:
        pickle.dump(df_topic_score, f)

    print('Finally, writing to DB!')
    # Writing back to HIVE is hard from Pandas so first output to CSV
    df_topic_score.to_csv('./inventory/noteevents_with_topics.csv',
                          index=False)

    print('Done Writing to DB!')
예제 #19
0
def main():
    """Main function to run tasks."""
    pandas_display_screen_widen()
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)

    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')

    # No tables are currently selected to run.
    file_list = []

    # Execute every selected task in order.
    run_files(db_manager, file_list, args.db_url, start_file=None)

    endtime = dt.datetime.now()

    elapsed = endtime - starttime

    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(elapsed))
    print('Done!')
    print('\n')
예제 #20
0
def main():
    """Main function to run tasks."""
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    print(SQL_PATH)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')

    # Tasks to execute; parse steps are currently disabled.
    file_list = [
        # 'parse.00_01_01_load_sba_datasets.py',
        # 'parse/00_01_02_sba__foia_7a_1991_1999',
        # 'parse/00_01_03_sba__foia_7a_2000_2009',
        # 'parse/00_01_04_sba__foia_7a_2010_present',
        # 'parse/00_01_05_sba__foia_504_1991_present',
        # 'parse.00_02_01_load_census_datasets.py'
        # 'parse/00_02_02_census__zip_business_patterns',
        # '00_03_01_load_irs_datasets.py',
        # 'parse/00_03_02_irs__zip_data',
        'queries/stg_analytics/00_01_01_sba_sfdo_zips',
        'queries/stg_analytics/00_01_sba_sfdo',
        'queries/stg_analytics/00_02_irs_income',
        'queries/stg_analytics/00_03_census_naics',
        'queries/trg_analytics/00_01_sba_metrics',
    ]

    # Execute every selected task in order.
    run_files(db_manager, file_list, args.db_url)

    endtime = dt.datetime.now()

    elapsed = endtime - starttime

    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(elapsed))
    print('Done!')
    print('\n')
예제 #21
0
def main():
    """Main function to run tasks."""
    args = get_args()
    db_manager = DBManager(db_url=args.db_url)
    print('\n' + '\n' + 'Started at ' + str(starttime))
    print('\n')

    # Tasks that parse raw source data; none are currently defined.
    parse_tasks = []

    # The full ETL pipeline: raw HQL loads, topic modeling, then the
    # model-dataset builds.
    tasks = [
        './etl/raw/hdfs_setup.sh',
        './etl/raw/admissions.hql',
        './etl/raw/callout.hql',
        './etl/raw/caregivers.hql',
        './etl/raw/ccs_dx_map.hql',
        './etl/raw/ccs_proc_map.hql',
        './etl/raw/chartevents.hql',
        './etl/raw/cptevents.hql',
        './etl/raw/d_cpt.hql',
        './etl/raw/d_icd_diagnoses.hql',
        './etl/raw/d_icd_procedures.hql',
        './etl/raw/d_items.hql',
        './etl/raw/d_labitems.hql',
        './etl/raw/datetimeevents.hql',
        './etl/raw/diagnoses_icd.hql',
        './etl/raw/drgcodes.hql',
        './etl/raw/icustays.hql',
        './etl/raw/inputevents_cv.hql',
        './etl/raw/inputevents_mv.hql',
        './etl/raw/labevents.hql',
        './etl/raw/microbiologyevents.hql',
        './etl/raw/noteevents.hql',
        './etl/raw/outputevents.hql',
        './etl/raw/patients.hql',
        './etl/raw/prescriptions.hql',
        './etl/raw/procedureevents_mv.hql',
        './etl/raw/procedures_icd.hql',
        './etl/raw/services.hql',
        './etl/raw/transfers.hql',
        'queries.features.noteevents_lda_models.py',
        'queries.features.noteevents_with_topics.py',
        './etl/model/noteevents_with_topics.hql',
        'queries/datasets/admissions_diagnoses_icd_ccs_mapping.hql',
        'queries.datasets.admissions_ccs_ohe.py',
        './etl/model/admissions_ccs_ohe.hql',
        'queries.datasets.admissions_topic_scores.py',
        './etl/model/admissions_topic_scores.hql',
        'queries/datasets/model_demog.hql',
        'queries/datasets/model_demog_dx.hql',
        'queries/datasets/model_demog_dx_notetopics.hql',
    ]

    file_list = parse_tasks + tasks if args.run_parse else tasks

    print('files are')
    print(file_list)
    # Execute every selected task in order.
    run_files(db_manager, file_list, args.db_url)

    endtime = dt.datetime.now()
    elapsed = endtime - starttime

    print('Ended at: ' + str(endtime))
    print('Total Runtime: ' + str(elapsed))
    print('Done!')
    print('\n')