import numpy as np import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading faculty dimension data") df = pd.read_csv('data/ipeds_faculty_dimension.csv') session = Session() try: print('Populating dimension tables.') # insert state data record_deletes = session.query(IpedsFacultyDimension).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=IpedsFacultyDimension, mappings=df.to_dict(orient='records')), print('\tFinished populating counties table.') except Exception as e: session.rollback() print( "\tAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\tChanges committed to database.') session.close() print("All Done.")
import numpy as np import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading csa data") df = pd.read_csv('data/csa.csv') session = Session() try: # bulk insert objects for federal agencies and academic fields (NSF) print('\nPopulating dimension tables.') # insert state data record_deletes = session.query(Csa).delete(synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=Csa, mappings=df.to_dict(orient='records')), print('\tFinished populating csa table.') except Exception as e: session.rollback() print( "\nAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\nChanges committed to database.') session.close() print("\nAll Done.")
def main(): print("Reading carnegie_classes data") labels = get_sheet('Labels') labels.columns = labels.columns.str.strip().str.lower() labels = labels.rename( columns={ 'variable': 'classification_id', 'label': 'classification', 'value': 'class_code', 'label.1': 'carnegie_class' }) labels = labels[labels.class_code.notnull()] labels = labels.ffill() vals = get_sheet('Data') # reshape from wide to long format carnegie = pd.melt(vals, id_vars=['UNITID'], var_name='classification_id', value_vars=[ 'BASIC2005', 'BASIC2010', 'BASIC2015', 'BASIC2018', 'IPUG2018', 'IPGRAD2018', 'ENRPROFILE2018', 'UGPROFILE2018', 'SIZESET2018', 'CCE2015', ], value_name='class_code') carnegie = carnegie.rename(columns={'UNITID': 'unitid'}) carnegie = carnegie.merge(labels, on=['class_code', 'classification_id'], how='inner') session = Session() try: print('Populating dimension tables.') # insert state data record_deletes = session.query(CarnegieClass).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=CarnegieClass, mappings=carnegie.to_dict(orient='records')), print('\tFinished populating carnegie_classes table.') except Exception as e: session.rollback() print( "\tAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) print("\tNo changed made to database.\n") else: session.commit() print('\tChanges committed to database.\n') session.close() print("All Done.")
import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading ipeds_custom_peer_lists data") df = pd.read_csv('data/ipeds_custom_peer_lists.csv') session = Session() try: # bulk insert objects for federal agencies and academic fields (NSF) print('\nPopulating dimension tables.') # insert state data record_deletes = session.query(IpedsCustomPeerList).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=IpedsCustomPeerList, mappings=df.to_dict(orient='records')), print('\tFinished populating ipeds_custom_peer_lists table.') except Exception as e: session.rollback() print( "\nAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\nChanges committed to database.') session.close() print("\nAll Done.")
import numpy as np import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading athletic_conferences data") df = pd.read_csv('data/athletic_conferences.csv') session = Session() try: print('\nPopulating dimension tables.') # insert state data record_deletes = session.query(AthleticConference).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=AthleticConference, mappings=df.to_dict(orient='records')), print('\tFinished populating athletic_conferences table.') except Exception as e: session.rollback() print( "\nAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\nChanges committed to database.') session.close() print("\nAll Done.")
import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading ipeds_demographic_dimension data") df = pd.read_csv('data/ipeds_demographic_dimension.csv') session = Session() try: print('\nPopulating dimension tables.') # insert state data record_deletes = session.query(IpedsDemographicDimension).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=IpedsDemographicDimension, mappings=df.to_dict(orient='records')), print('\tFinished populating ipeds_demographic_dimension table.') except Exception as e: session.rollback() print( "\nAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\nChanges committed to database.') session.close() print("\nAll Done.")
cip4 = cip[cip.cipcode.str.len() == 5][['cipcode', 'title']] cip4 = cip4.rename(columns={'cipcode': 'cip4', 'title': 'cip4_description'}) hierarchy = cip6.merge(right=cip2, how='left', on='cip2') hierarchy = hierarchy.merge(right=cip4, how='left', on='cip4') hierarchy.cip4_description = hierarchy.cip4_description.fillna('Unknown') hierarchy.cip2_description = hierarchy.cip2_description.fillna('Unknown') try: print('Populating dimension table.') # insert cip data record_deletes = session.query(Cip).delete(synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=Cip, mappings=hierarchy.to_dict(orient='records')), print('\tFinished populating cip_hierarchy table.') except Exception as e: session.rollback() print( "\tAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) print('\tNo changes were made to the database due to error(s).') else: session.commit() print('\tChanges committed to database.\n') session.close() print("All Done.")
def main(): for year in np.arange(args.first, args.last + 1): try: spec = f'data/nsf_{year}.pickle' print(f'Reading data for fiscal year ending {year}...', end='', flush=True) with open(spec, 'rb') as f: herd = pickle.load(f) except Exception as e: print(f'ERROR.\nFile not downloaded properly.\n\n{str(e)}\n') else: print('DONE.') # herd.info() # set date key date_key = f'{year}-06-30' # modify data frame to apply needed fixes herd['date_key'] = date_key # convert id's and add missing values herd.ncses_inst_id = herd.ncses_inst_id.fillna('XXXXXXXX') # get unitid fixes with open('data/inst_id_fixes.pickle', 'rb') as f: fixes = pickle.load(f) # apply fixes to unitid column, fill missing, and convert to integer herd['unitid'] = herd.inst_id.map(fixes) herd.unitid = np.where(herd.unitid.isna(), herd.ipeds_unitid, herd.unitid) herd.unitid = herd.unitid.fillna(-1).astype(int) herd = herd[herd.column.isin(['DOD', 'DOE', 'HHS', 'NASA', 'NSF', 'USDA', 'Other agencies', 'State and local government', 'Business', 'Institution funds', 'Nonprofit organziations'])] herd['agency_key'] = item_recode(herd.column, {'DOD': 'DOD', 'DOE': 'DOE', 'HHS': 'HHS', 'NASA': 'NAS', 'NSF': 'NSF', 'USDA': 'USD', 'Other agencies': 'OTH', 'State and local government': 'SLG', 'Business': 'BUS', 'Institution funds': 'INS', 'All other sources': 'OTH', 'Nonprofit organziations': 'NPO'}, 'Unknown') herd['funding_type'] = herd.questionnaire_no.str[:2] herd = herd[herd.funding_type.isin(['09', '11'])] herd['funding_type'] = item_recode(herd.funding_type, {'09': 'Federal', '11': 'Non-federal'}, 'Unknown') herd['academic_field_key'] = herd.questionnaire_no.str[2:].str.strip() herd = herd[herd.academic_field_key.isin(['A','B01','B02','B03','B04','B05','B06','B07','B08', 'B09','C01','C02','C03','C04','D01','D02','D03','D04', 'D05','E','F01','F02','F03','F04','F05','G','H01', 'H02','H03','H04','H05','I','J01','J02','J03','J04', 'J05','J06','J07','J08'])] herd['data'] = herd.data.fillna(0) * 1000 # rename columns herd = herd.rename(columns = {'data': 'expenditure'}) # de-duplicate items keepers = ['inst_id', 'date_key', 'funding_type', 'agency_key', 'academic_field_key', 'ncses_inst_id', 'unitid', 'expenditure'] herd = herd[keepers].groupby(['inst_id', 'date_key', 'funding_type', 'agency_key', 'academic_field_key', 'ncses_inst_id', 'unitid']).sum().reset_index() herd['expenditure'] = herd.expenditure.fillna(0) # herd = herd.fillna(sql.null()) # insert data into dbo.survey_records session = Session() try: print(f'Attempting to insert {herd.shape[0]:,} rows for {year} into {NsfHerdDetail.__tablename__}.') record_deletes = session.query(NsfHerdDetail).filter(NsfHerdDetail.date_key==date_key).delete(synchronize_session=False) session.bulk_insert_mappings(mapper = NsfHerdDetail, mappings = herd.to_dict(orient='records'), render_nulls = True) except Exception as e: session.rollback() print(str(e)) print('No data were altered due to error.\n') else: session.commit() print(f'\t{record_deletes:,} old records were deleted.') print(f'\t{herd.shape[0]:,} new records were inserted.\n') finally: session.close() session = None print('All done!')
import numpy as np import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading nsf_herd_federal_agencies data") df = pd.read_csv('data/nsf_herd_federal_agencies.csv') session = Session() try: print('\nPopulating dimension tables.') # insert state data record_deletes = session.query(NsfHerdFederalAgency).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=NsfHerdFederalAgency, mappings=df.to_dict(orient='records')), print('\tFinished populating nsf_herd_federal_agencies table.') except Exception as e: session.rollback() print( "\nAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\nChanges committed to database.') session.close() print("\nAll Done.")
import numpy as np import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading ipeds_degree_types data") df = pd.read_csv('data/ipeds_degree_types.csv') session = Session() try: print('\nPopulating dimension tables.') # insert state data record_deletes = session.query(IpedsDegreeType).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=IpedsDegreeType, mappings=df.to_dict(orient='records')), print('\tFinished populating ipeds_degree_types table.') except Exception as e: session.rollback() print( "\nAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\nChanges committed to database.') session.close() print("\nAll Done.")
def main(): for year in np.arange(args.first, args.last + 1): try: spec = f'data/nsf_{year}.pickle' print(f'Reading data for fiscal year ending {year}...', end='', flush=True) with open(spec, 'rb') as f: herd = pickle.load(f) except Exception as e: print(f'ERROR.\nFile not downloaded properly.\n\n{str(e)}\n') else: print('DONE.') # herd.info() # set date key date_key = f'{year}-06-30' # modify data frame to apply needed fixes herd['date_key'] = date_key herd['med_sch_flag'] = herd.med_sch_flag.isin( ['T', 'TRUE', 'True', 'true', 't', 'Y', 'Yes', '1']) herd['toi_code'] = herd.toi_code == 1 toc = {1: 'Public', 2: 'Private'} herd['toc_code'] = item_recode(herd['toc_code'], toc, 'Unknown') herd['inst_state_code'] = item_recode(herd['inst_state_code'], state_fips, 0) # convert id's and add missing values herd.ncses_inst_id = herd.ncses_inst_id.fillna('XXXXXXXX') # get unitid fixes with open('data/inst_id_fixes.pickle', 'rb') as f: fixes = pickle.load(f) # apply fixes to unitid column, fill missing, and convert to integer herd['unitid'] = herd.inst_id.map(fixes) herd.unitid = np.where(herd.unitid.isna(), herd.ipeds_unitid, herd.unitid) herd.unitid = herd.unitid.fillna(-1).astype(int) # select questionnaire_no's for institutional aggregate values herd = herd[herd.questionnaire_no.isin([ '01.a', '01.b', '01.c', '01.d', '01.e', '01.f', '01.g', '04', 'NA_01', '15' ])] # data are reported in thousands of dollars - make explicit herd['data'] = np.where(herd.questionnaire_no == '15', herd.data, herd.data.fillna(0) * 1000) # add labels for personnel variables herd.loc[(herd['questionnaire_no'] == '15') & (herd['row'] == 'Principal investigators'), 'questionnaire_no'] = 'principal_investigators' herd.loc[(herd['questionnaire_no'] == '15') & (herd['row'] == 'Other personnel'), 'questionnaire_no'] = 'other_personnel' herd.loc[(herd['questionnaire_no'] == '15') & (herd['row'] == 'Total'), 'questionnaire_no'] = 'research_personnel' keepers = [ 'inst_id', 'date_key', 'unitid', 'ncses_inst_id', 'inst_name_long', 'inst_state_code', 'toc_code', 'toi_code', 'med_sch_flag', 'questionnaire_no', 'data' ] # long to wide institutions = herd.pivot_table(index=[ 'inst_id', 'date_key', 'unitid', 'ncses_inst_id', 'inst_name_long', 'inst_state_code', 'toc_code', 'toi_code', 'med_sch_flag' ], columns='questionnaire_no', values='data', aggfunc=np.sum, fill_value=0).reset_index() # rename columns institutions = institutions.rename( columns={ 'inst_name_long': 'institution_name', 'inst_state_code': 'state_fips', 'toc_code': 'control', 'toi_code': 'academic_institution', 'med_sch_flag': 'medical_school_flag', '01.a': 'federal_government', '01.b': 'state_and_local_government', '01.c': 'business', '01.d': 'nonprofit_organizations', '01.e': 'institutional_funds', '01.f': 'other_sources', '01.g': 'total_rd_expenses', '04': 'medical_school_expenses', 'NA_01': 'arra_funds' }) # replace NaN with database-compliant nulls institutions['research_personnel'] = np.where( institutions.research_personnel == 0, None, institutions.research_personnel) # insert data into dbo.survey_records session = Session() try: print( f'Attempting to insert {institutions.shape[0]:,} rows for {year} into {NsfHerdInstitution.__tablename__}.' ) record_deletes = session.query(NsfHerdInstitution).filter( NsfHerdInstitution.date_key == date_key).delete( synchronize_session=False) session.bulk_insert_mappings( mapper=NsfHerdInstitution, mappings=institutions.to_dict(orient='records'), render_nulls=True) except Exception as e: session.rollback() print(str(e)) print('No data were altered due to error.\n') else: session.commit() print(f'\t{record_deletes:,} old records were deleted.') print(f'\t{institutions.shape[0]:,} new records were inserted.\n') finally: session.close() session = None print('All done!')
import numpy as np import pandas as pd from pandas import DataFrame from sqlalchemy import sql print("Reading nsf_herd_academic_fields data") df = pd.read_csv('data/nsf_herd_academic_fields.csv') session = Session() try: print('\nPopulating dimension tables.') # insert state data record_deletes = session.query(NsfHerdAcademicField).delete( synchronize_session=False) df_inserts = session.bulk_insert_mappings( mapper=NsfHerdAcademicField, mappings=df.to_dict(orient='records')), print('\tFinished populating nsf_herd_academic_fields table.') except Exception as e: session.rollback() print( "\nAn error occurred and no data were changed in the database.\n\nError:\n{}" .format(str(e))) else: session.commit() print('\nChanges committed to database.') session.close() print("\nAll Done.")