def ETL(data):
    """Run ``data`` through the naming/normalising steps and dump it to S3.

    ``data`` is handed to ``Flow`` as its first step. The resource is then
    named ``table_name``, pointed at ``<table_name>.csv``, passed through
    ``joined_lower`` and finally given to ``dump_to_s3``.

    NOTE(review): ``table_name`` is not assigned inside this function; it is
    presumably a module-level name -- confirm.
    """
    target_root = create_base_path(__file__)
    steps = [
        data,
        update_resource(None, name=table_name),
        update_resource(resources=table_name, path=table_name + '.csv'),
        joined_lower(resources=table_name),
        dump_to_s3(resources=table_name, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL(table_name):
    """Load ``tmp/<table_name>.csv`` (next to this file) and dump it to S3.

    Args:
        table_name: Used as the CSV file stem, the resource name, and the
            resource selector for ``joined_lower`` and ``dump_to_s3``.
    """
    target_root = create_base_path(__file__)
    csv_file = Path(__file__).parent.joinpath('tmp', f'{table_name}.csv')

    source = load(str(csv_file), name=table_name, format='csv',
                  force_strings=True)
    normalise = joined_lower(resources=table_name)
    sink = dump_to_s3(resources=table_name,
                      params={'base_path': target_root})

    Flow(source, normalise, sink).process()
def ETL():
    """Load the ``doe_bluebook`` CSV export by URL and dump it to S3.

    The resource is read with ``force_strings=True``, passed through
    ``joined_lower`` and handed to ``dump_to_s3`` under the base path
    derived from this file.
    """
    resource = 'doe_bluebook'
    source_url = 'https://data.cityofnewyork.us/api/views/8b9a-pywy/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    steps = [
        load(source_url, name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``doitt_buildingfootprints`` CSV export by URL and dump it to S3.

    This flow has only two steps: ``load`` (with ``force_strings=True``)
    and ``dump_to_s3``. No ``joined_lower`` step is applied here.
    """
    resource = 'doitt_buildingfootprints'
    source_url = 'https://data.cityofnewyork.us/api/views/pkvt-jviv/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    source = load(source_url, name=resource, format='csv', force_strings=True)
    sink = dump_to_s3(resources=resource, params={'base_path': target_root})
    Flow(source, sink).process()
def ETL():
    """Load the local ``usnps_parks`` CSV and dump it to S3.

    The CSV is read from the ``nps_boundry`` directory that sits next to
    this file, passed through ``joined_lower`` and handed to ``dump_to_s3``.
    """
    resource = 'usnps_parks'
    target_root = create_base_path(__file__)
    # Directory name is spelled 'nps_boundry' on disk; keep it verbatim.
    csv_file = Path(__file__).parent.joinpath('nps_boundry', 'usnps_parks.csv')

    steps = [
        load(str(csv_file), name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``hpd_hny_units_by_building`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``dump_to_s3``.
    """
    resource = 'hpd_hny_units_by_building'
    source_url = 'https://data.cityofnewyork.us/api/views/hg8x-zxpr/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    pipeline = Flow(
        load(source_url, name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    )
    pipeline.process()
def ETL():
    """Load ``housing_input_removals`` from GitHub, add field ``b``, dump to S3.

    Unlike most sibling recipes this one loads with ``force_strings=False``.
    After ``joined_lower`` an extra string field named ``b`` with an empty
    default is added before the resource is handed to ``dump_to_s3``.
    """
    resource = 'housing_input_removals'
    source_url = 'https://raw.githubusercontent.com/NYCPlanning/db-developments/master/developments_build/data/housing_input_removals.csv'
    target_root = create_base_path(__file__)

    steps = [
        load(source_url, name=resource, format='csv', force_strings=False),
        joined_lower(resources=resource),
        add_field('b', 'string', ''),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``bic_tradewaste`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``dump_to_s3``.
    """
    resource = 'bic_tradewaste'
    source_url = 'https://data.cityofnewyork.us/api/views/hsjb-p5ky/rows.csv'
    target_root = create_base_path(__file__)

    source = load(source_url, name=resource, format='csv', force_strings=True)
    normalise = joined_lower(resources=resource)
    sink = dump_to_s3(resources=resource, params={'base_path': target_root})
    Flow(source, normalise, sink).process()
def ETL():
    """Load ``tmp/dsny_mtsgaragemaintenance.csv`` and dump it to S3.

    The CSV is expected in the ``tmp`` directory next to this file. It is
    read with ``force_strings=True``, passed through ``joined_lower`` and
    handed to ``dump_to_s3``.
    """
    resource = 'dsny_mtsgaragemaintenance'
    target_root = create_base_path(__file__)
    csv_file = Path(__file__).parent.joinpath(
        'tmp', 'dsny_mtsgaragemaintenance.csv')

    steps = [
        load(str(csv_file), name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``nycha_communitycenters`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``dump_to_s3``.
    """
    resource = 'nycha_communitycenters'
    source_url = 'https://data.cityofnewyork.us/api/views/crns-fw6u/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    pipeline = Flow(
        load(source_url, name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    )
    pipeline.process()
def ETL():
    """Load the NYSED non-public enrollment workbook and dump it to S3.

    The source is an ``.xlsx`` file loaded with ``format='xlsx'``. After
    ``joined_lower`` the resource path is set to
    ``nysed_nonpublicenrollment.csv`` before ``dump_to_s3`` runs.
    """
    resource = 'nysed_nonpublicenrollment'
    source_url = 'http://www.p12.nysed.gov/irs/statistics/nonpublic/2018-19_NonPub_EnrollmentbyGrade.xlsx'
    target_root = create_base_path(__file__)

    steps = [
        load(source_url, name=resource, format='xlsx', force_strings=True),
        joined_lower(resources=resource),
        update_resource(resources=resource, path=f'{resource}.csv'),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``nysdoh_nursinghomes`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``update_resource`` (path set to ``nysdoh_nursinghomes.csv``),
    ``dump_to_s3``.
    """
    resource = 'nysdoh_nursinghomes'
    source_url = 'https://health.data.ny.gov/api/views/izta-vnpq/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    source = load(source_url, name=resource, format='csv', force_strings=True)
    normalise = joined_lower(resources=resource)
    set_path = update_resource(resources=resource, path=f'{resource}.csv')
    sink = dump_to_s3(resources=resource, params={'base_path': target_root})
    Flow(source, normalise, set_path, sink).process()
def ETL():
    """Load the ``dcla_culturalinstitutions`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``dump_to_s3``.
    """
    resource = 'dcla_culturalinstitutions'
    source_url = 'https://data.cityofnewyork.us/api/views/u35m-9t32/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    steps = [
        load(source_url, name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``nysdec_solidwaste`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``dump_to_s3``.
    """
    resource = 'nysdec_solidwaste'
    source_url = 'https://data.ny.gov/api/views/2fni-raj8/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    pipeline = Flow(
        load(source_url, name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    )
    pipeline.process()
def ETL(data):
    """Publish ``data`` as the ``nypl_libraries`` resource on S3.

    ``data`` is handed to ``Flow`` as its first step. The ``lon`` and
    ``lat`` fields are set to type ``string``, the resource is named
    ``nypl_libraries`` with path ``nypl_libraries.csv``, passed through
    ``joined_lower`` and handed to ``dump_to_s3``.
    """
    resource = 'nypl_libraries'
    target_root = create_base_path(__file__)

    steps = [
        data,
        set_type('lon', type='string'),
        set_type('lat', type='string'),
        update_resource(None, name=resource),
        update_resource(resources=resource, path=f'{resource}.csv'),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``nysdoh_healthfacilities`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``dump_to_s3``.
    """
    resource = 'nysdoh_healthfacilities'
    source_url = 'https://health.data.ny.gov/api/views/vn5v-hh5r/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    source = load(source_url, name=resource, format='csv', force_strings=True)
    normalise = joined_lower(resources=resource)
    sink = dump_to_s3(resources=resource, params={'base_path': target_root})
    Flow(source, normalise, sink).process()
def ETL():
    """Load the ``nysopwdd_providers`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``joined_lower``,
    ``update_resource`` (path set to ``nysopwdd_providers.csv``),
    ``dump_to_s3``.
    """
    resource = 'nysopwdd_providers'
    source_url = 'https://data.ny.gov/api/views/ieqx-cqyk/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    steps = [
        load(source_url, name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        update_resource(resources=resource, path=f'{resource}.csv'),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``qpl_libraries`` CSV export by URL and dump it to S3.

    Steps, in order: ``load`` (``force_strings=True``), ``update_resource``
    (path set to ``qpl_libraries.csv``), ``joined_lower``, ``dump_to_s3``.
    Note that the path is set *before* ``joined_lower`` in this recipe.
    """
    resource = 'qpl_libraries'
    source_url = 'https://data.cityofnewyork.us/api/views/kh3d-xhq7/rows.csv?accessType=DOWNLOAD'
    target_root = create_base_path(__file__)

    pipeline = Flow(
        load(source_url, name=resource, format='csv', force_strings=True),
        update_resource(resources=resource, path=f'{resource}.csv'),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    )
    pipeline.process()
def ETL(data):
    """Load the local ``foodbankny_foodbanks.csv`` and dump it to S3.

    Args:
        data: Accepted but not used. The ``data`` source is commented out
            and the flow reads the CSV that sits next to this file instead.

    After loading, the resource is named ``foodbankny_foodbanks`` with path
    ``foodbankny_foodbanks.csv``, passed through ``joined_lower`` and
    handed to ``dump_to_s3``.
    """
    resource = 'foodbankny_foodbanks'
    target_root = create_base_path(__file__)
    source_csv = f'{Path(__file__).parent}/foodbankny_foodbanks.csv'

    steps = [
        # data,
        load(source_csv),
        update_resource(None, name=resource),
        update_resource(resources=resource, path=f'{resource}.csv'),
        joined_lower(resources=resource),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load the ``facilities_classification`` reference CSV and dump it to S3.

    The CSV is loaded from a raw GitHub URL with ``force_strings=True``,
    passed through ``joined_lower`` and handed to ``dump_to_s3``.
    """
    resource = 'facilities_classification'
    source_url = 'https://raw.githubusercontent.com/NYCPlanning/db-facilities-tmp/dev/referencetables/classification.csv'
    target_root = create_base_path(__file__)

    source = load(source_url, name=resource, format='csv', force_strings=True)
    normalise = joined_lower(resources=resource)
    sink = dump_to_s3(resources=resource, params={'base_path': target_root})
    Flow(source, normalise, sink).process()
def ETL(table_name):
    """Load the first ``.csv`` found under ``tmp`` and dump it to S3.

    The ``tmp`` directory next to this file is searched recursively and the
    first entry (in glob order) whose suffix is ``.csv`` is used as the
    source.

    Args:
        table_name: Resource name; also used to build ``<table_name>.csv``
            as the resource path.

    Raises:
        IndexError: If no ``.csv`` file exists under ``tmp``.
    """
    target_root = create_base_path(__file__)
    tmp_dir = Path(__file__).parent / 'tmp'

    csv_candidates = []
    for entry in tmp_dir.glob('**/*'):
        if entry.suffix == '.csv':
            csv_candidates.append(entry)
    csv_file = csv_candidates[0]

    steps = [
        load(str(csv_file), name=table_name, format='csv',
             force_strings=True),
        joined_lower(resources=table_name),
        update_resource(None, name=table_name),
        update_resource(resources=table_name, path=table_name + '.csv'),
        dump_to_s3(resources=table_name, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Bundle the ``latest`` housing input datasets and dump them to S3.

    Each enabled ``Load(name, 'latest')`` step adds one dataset to the flow;
    ``dump_2_s3`` then writes the result under the base path derived from
    this file. Datasets that are currently disabled are kept as comments in
    their original position.
    """
    base_path = create_base_path(__file__)
    Flow(
        Load('housing_input_hny_job_manual', 'latest'),
        # Load('housing_input_dcpattributes', 'latest'),
        # Load('housing_input_lookup_occupancy', 'latest'),
        Load('housing_input_removals', 'latest'),
        # Load('housing_input_lookup_status','latest'),
        # Load('dob_cofos','latest'),
        # Load('dob_jobapplications','latest'),
        # Load('dob_permitissuance','latest'),
        Load('hpd_hny_units_by_building', 'latest'),
        Load('hpd_hny_units_by_project', 'latest'),
        dump_2_s3(params=dict(base_path=base_path)),
    ).process()
def ETL():
    """Load ``tmp/usdot_airports.csv``, filter it, and dump it to S3.

    After ``joined_lower`` two ``filter_rows`` steps are applied: one with
    ``state_name='NEW YORK'`` and one listing the counties NEW YORK, BRONX,
    KINGS, QUEENS and RICHMOND. The filtered resource is handed to
    ``dump_to_s3``.
    """
    resource = 'usdot_airports'
    target_root = create_base_path(__file__)
    csv_file = Path(__file__).parent.joinpath('tmp', 'usdot_airports.csv')

    counties = ('NEW YORK', 'BRONX', 'KINGS', 'QUEENS', 'RICHMOND')

    steps = [
        load(str(csv_file), name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        filter_rows(equals=[{'state_name': 'NEW YORK'}]),
        filter_rows(equals=[{'county': county} for county in counties]),
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Load ``nysed_activeinstitutions.csv``, rename fields, dump to S3.

    The CSV sits next to this file. After ``joined_lower`` four fields whose
    names contain parentheses are renamed to parenthesis-free names, then
    the resource is handed to ``dump_to_s3``.
    """
    resource = 'nysed_activeinstitutions'
    target_root = create_base_path(__file__)
    csv_file = Path(__file__).parent.joinpath('nysed_activeinstitutions.csv')

    # (old name, new name) pairs, applied in this order.
    renames = (
        ('gis_longitute_(x)', 'gis_longitute_x'),
        ('gis_latitude_(y)', 'gis_latitude_y'),
        ('federal_information_processing_standard_(fips)_state_code',
         'federal_information_processing_standard_fips_state_code'),
        ('federal_information_processing_standard_(fips)_county_code',
         'federal_information_processing_standard_fips_county_code'),
    )

    steps = [
        load(str(csv_file), name=resource, format='csv', force_strings=True),
        joined_lower(resources=resource),
        *[rename_field(old, new) for old, new in renames],
        dump_to_s3(resources=resource, params={'base_path': target_root}),
    ]
    Flow(*steps).process()
def ETL():
    """Bundle the ``latest`` zoning-related datasets and dump them to S3.

    One ``Load(name, 'latest')`` step is created per dataset, in the order
    listed below, followed by ``dump_2_s3`` writing under the base path
    derived from this file.
    """
    base_path = create_base_path(__file__)
    dataset_names = (
        'dcp_commercialoverlay',
        'dcp_limitedheight',
        'dcp_mih',
        'dof_dtm',
        'dcp_specialpurpose',
        'dcp_specialpurposesubdistricts',
        'dcp_zoningdistricts',
        'dcp_zoningmapamendments',
        'dcp_zoningtaxlots',
        'dcp_zoningmapindex',
    )
    Flow(
        *[Load(name, 'latest') for name in dataset_names],
        dump_2_s3(params=dict(base_path=base_path)),
    ).process()
def ETL(table_name):
    """Upload ``tmp/<table_name>.csv`` straight to S3 and report the timing.

    The object key is ``<base path of this file>/<table_name>.csv``. The
    bucket name is read from the ``BUCKET`` environment variable and the
    object is uploaded with a ``public-read`` ACL. The content type is
    guessed from the key and falls back to ``text/plain``.

    Args:
        table_name: Stem of the local CSV in ``tmp`` and of the S3 key.
    """
    key = str(Path(create_base_path(__file__)) / f'{table_name}.csv')
    file_path = str(Path(__file__).parent / 'tmp' / f'{table_name}.csv')
    content_type, _ = mimetypes.guess_type(key)
    client = make_client()
    bucket = os.environ.get('BUCKET')

    # 100 MiB. This was previously written as ``1024^2*100``; in Python ``^``
    # is bitwise XOR and binds looser than ``*``, so that expression
    # evaluated to 1024 ^ 200 == 1224 bytes rather than 100 MiB.
    part_size = 1024 ** 2 * 100
    config = TransferConfig(
        multipart_threshold=part_size,
        max_concurrency=10,
        multipart_chunksize=part_size,
        use_threads=True,
    )

    beg_ts = time.time()
    client.upload_file(
        Filename=file_path,
        Bucket=bucket,
        Config=config,
        ExtraArgs={
            'ACL': 'public-read',
            'ContentType': content_type or 'text/plain',
        },
        Key=key,
    )
    end_ts = time.time()
    print(f'dumped to {key}, elapsed time: {end_ts - beg_ts}')
def ETL():
    """Bundle the ``latest`` facilities source datasets and dump them to S3.

    One ``Load(name, 'latest')`` step is created per dataset, in the order
    listed below, followed by ``dump_2_s3`` writing under the base path
    derived from this file.
    """
    base_path = create_base_path(__file__)
    dataset_names = (
        'doe_universalprek',
        'hhc_hospitals',
        'dcla_culturalinstitutions',
        'nycha_policeservice',
        'dohmh_daycare',
        'dpr_parksproperties',
        'doe_busroutesgarages',
        'dcp_pops',
        'dcas_colp',
        'dfta_contracts',
        'dycd_afterschoolprograms',
        'sbs_workforce1',
        'nysdec_solidwaste',
        'nysomh_mentalhealth',
        'nysdoh_healthfacilities',
        'nysopwdd_providers',
        'usnps_parks',
        'dca_operatingbusinesses',
        'dep_wwtc',
        'foodbankny_foodbanks',
        'bpl_libraries',
        'qpl_libraries',
        'dsny_mtsgaragemaintenance',
        'doe_lcgms',
        'nysdoh_nursinghomes',
        'nysed_activeinstitutions',
        'usdot_airports',
        'nypl_libraries',
        'usdot_ports',
        'dot_mannedfacilities',
        'dot_bridgehouses',
        'dot_ferryterminals',
        'dot_publicparking',
        'hra_centers',
        'nysdec_lands',
        'nycha_communitycenters',
        # NOTE(review): the next two entries repeat the two above. They are
        # kept so the flow is unchanged -- confirm whether the duplicate
        # loads are intentional or a copy/paste leftover.
        'nysdec_lands',
        'nycha_communitycenters',
        'moeo_socialservicesiteloactions',
        'fbop_corrections',
        'nysparks_historicplaces',
        'uscourts_courts',
        'nysocfs_offices',
        'nysoasas_programs',
        'nysdoccs_corrections',
        'nycdoc_corrections',
        'dot_pedplazas',
        'nycourts_courts',
        'dcp_sfpsd',
        'nysparks_parks',
        'doe_bluebook',
        'acs_daycareheadstart',
        'fdny_firehouses',
        # # load geo boundaries
        # 'doitt_buildingcentroids',
        # 'dcp_boroboundaries_wi',
        # 'dcp_cdboundaries',
        # 'dcp_censustracts',
        # 'dcp_councildistricts',
        # 'dcp_ntaboundaries',
        # 'dcp_policeprecincts',
        # 'dcp_school_districts',
        # 'doitt_zipcodeboundaries',
    )
    Flow(
        *[Load(name, 'latest') for name in dataset_names],
        dump_2_s3(params=dict(base_path=base_path)),
    ).process()