def ftp_and_prime_geocoder(job, **kwparameters):
    # Deserializing the command-line parameters feels kludgy, but it also
    # doesn't seem worth folding them into the Job object just for the custom
    # processing function and the run_pipeline function.
    use_local_input_file = kwparameters['use_local_input_file']
    if not use_local_input_file:
        fetch_city_file(job)

    # Geocoding stuff
    areas = ['NEIGHBORHOOD', 'TRACT', 'COUNCIL_DISTRICT', 'PLI_DIVISION',
             'POLICE_ZONE', 'FIRE_ZONE', 'PUBLIC_WORKS_DIVISION', 'WARD']
    parcel_file = job.local_directory + "parcel_areas.csv"
    # coords appears to be shared at module scope (this function primes it for
    # later geocoding lookups), so declare it explicitly and (re)initialize it.
    global coords
    coords = {}
    with open(parcel_file) as f:
        dr = csv.DictReader(f)
        for row in dr:
            # Index each parcel's coordinates and area assignments by its PIN.
            coords[row['PIN']] = {'x': row['x'], 'y': row['y']}
            for area in areas:
                coords[row['PIN']][area] = row[area]
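
# A minimal, hypothetical sketch of how the coords dict primed above might be
# consulted downstream; areas_for_parcel() is not part of the original script,
# and it assumes coords has already been populated by ftp_and_prime_geocoder().
def areas_for_parcel(pin):
    record = coords.get(pin)
    if record is None:
        return None  # Parcel PIN absent from parcel_areas.csv.
    # Each record carries x/y coordinates plus the administrative-area
    # assignments (NEIGHBORHOOD, TRACT, ..., WARD) read from parcel_areas.csv.
    return {k: v for k, v in record.items() if k not in ('x', 'y')}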
def conditionally_get_city_files(job, **kwparameters):
    if not kwparameters['use_local_files']:
        fetch_city_file(job)
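
# A hypothetical sketch of how the deserialized command-line flags mentioned
# above might be passed into these hooks; only 'use_local_input_file',
# 'use_local_files', and 'test_mode' actually appear in this script -- the
# wiring itself (and the job argument) is assumed, not a confirmed
# rocket-etl API.
def _demo_invoke_hooks(job):
    kwparameters = {
        'use_local_input_file': False,  # Fetch a fresh copy rather than reusing a local one.
        'use_local_files': False,
        'test_mode': True,  # Suppresses the Slack notification in preprocess().
    }
    conditionally_get_city_files(job, **kwparameters)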
def preprocess(job, **kwparameters):
    import datetime, usaddress, re, csv, copy
    # This is the first time a little joblet was created to handle the
    # file-fetching in the initial custom-processing script. Eventually the
    # manner of doing this stuff should be standardized... Other scripts have
    # broken such a multi-step process into two processes at the job_dict
    # level, which is another option and is perhaps a little easier for the
    # reader to follow. It would also route around the INPUT_FILE/OUTPUT_FILE
    # stuff below. Really this alternate approach is a result of taking two
    # existing scripts and combining them into one rocket-etl-style script in
    # a way that minimized code changes and possible unanticipated bugs.
    get_file_job = Job({
        'source_type': 'sftp',  # This parameter is currently unneeded by fetch_city_file.
        'source_dir': '',
        'source_file': 'right_of_way_permits.csv',
        'job_directory': 'right_of_way_backup',  # This parameter is usually auto-added in launchpad.py.
    })
    fetch_city_file(get_file_job)

    print("Let the preprocessing begin...")
    INPUT_FILE = '/home/daw165/rocket-etl/source_files/right_of_way_backup/right_of_way_permits.csv'
    OUTPUT_FILE = '/home/daw165/rocket-etl/source_files/right_of_way_backup/right_of_way_transmuted_backup.csv'
    INPUT_HEADING = [  # Documented for reference; csv.DictReader takes its field names from the file itself.
        "id", 'display', "type.display", "openDate", "Date.From", "Date.To",
        "asis.Restoration.cDate", "description", "loc.address_full",
        "asis.From", "asis.From.c2", "asis.From.c3", "asis.From.c4",
        "asis.From.c5", "asis.From.c1",
        "asis.To", "asis.To.c2", "asis.To.c3", "asis.To.c4", "asis.To.c5",
        "asis.To.c1",
        "asis.Location", "asis.Location.c2", "asis.Location.c3",
        "asis.Location.c4", "asis.Location.c5", "asis.Location.c1",
        "prof.businessName", "prof.licenseType.value", "geo.x", "geo.y"
    ]
    OUTPUT_HEADING = [
        'display', "sequence", "type", "open_date", "from_date", "to_date",
        "restoration_date", "description", "full_address", "location",
        "from_street", "to_street", "business_name", "license_type",
        "latitude", "longitude", "from_lat", "from_lon", "to_lat", "to_lon"
    ]
    ROW_MAPPING = {
        'display': 'display',
        'type': 'type.display',
        'open_date': 'openDate',
        'from_date': 'Date.From',
        'to_date': 'Date.To',
        'restoration_date': 'asis.Restoration.cDate',
        'description': 'description',
        'full_address': 'loc.address_full',
        # 'location': '',
        # 'from': '',
        # 'to': '',
        'business_name': 'prof.businessName',
        'license_type': 'prof.licenseType.value',
        'latitude': 'geo.y',
        'longitude': 'geo.x',
    }

    def clean_street(addr):
        # Reduce a raw address to "<pre-directional> <street name> <post type>",
        # collapsing any repeated whitespace.
        try:
            parts = usaddress.tag(addr)[0]
            pre = parts.get('StreetNamePreDirectional', '')
            name = parts.get('StreetName', '')
            post = parts.get('StreetNamePostType', '')
            return re.sub(r'\s+', ' ', '{} {} {}'.format(pre, name, post)).strip()
        except Exception:  # Log unparseable addresses rather than crashing.
            print(addr)

    input_f = open(INPUT_FILE)
    output_f = open(OUTPUT_FILE, 'w')
    reader = csv.DictReader(input_f)
    writer = csv.DictWriter(output_f, OUTPUT_HEADING)
    writer.writeheader()
    resource_id = "cc17ee69-b4c8-4b0c-8059-23af341c9214"  # Production version of the Right-of-Way Permits table
    published_ids = [x['id'] for x in
                     query_resource(site, 'SELECT DISTINCT id FROM "{}"'.format(resource_id), API_key)]
    print(f"Found {len(published_ids)} extant, published 'id' values.")
    written = 0
    open_dates = []
    for row in reader:
        # This condition has been reversed with respect to the original
        # preprocess.py script to catch the uncaught rows.
        if datetime.datetime.strptime(row['openDate'], '%Y-%m-%d') >= datetime.datetime.now() - datetime.timedelta(days=6):
            print('too new', row['openDate'])
            continue
        # If the 'id' field value is already in the published data, skip this row.
        if row['display'] in published_ids:
            continue
        written += 1
        new_row_base = {}
        for new_heading, old_heading in ROW_MAPPING.items():
            new_row_base[new_heading] = row[old_heading]
        for i in range(6):
            if not i:
                suffix = ''  # No appended number for the 0th item.
            else:
                suffix = '.c' + str(i)
            location = row['asis.Location{}'.format(suffix)]
            to_street = row['asis.To{}'.format(suffix)]
            from_street = row['asis.From{}'.format(suffix)]
            new_row = copy.deepcopy(new_row_base)
            new_row['sequence'] = str(i - 1) if i else 0
            open_dates.append(row['openDate'])
            if location != 'NA':
                new_row['location'] = location
                new_row['from_street'] = from_street
                new_row['to_street'] = to_street
                new_row['from_lat'], new_row['from_lon'] = '', ''
                new_row['to_lat'], new_row['to_lon'] = '', ''
                location = clean_street(location)
                if from_street != 'NA':
                    x, y = '', ''  ## cross_geocode(location, from_street)
                    new_row['from_lat'], new_row['from_lon'] = y, x
                    #streets['{}--{}'.format(location, from_street)] = (x, y)
                    #streets['{}--{}'.format(from_street, location)] = (x, y)
                if to_street != 'NA':
                    x, y = '', ''  ## cross_geocode(location, to_street)
                    new_row['to_lat'], new_row['to_lon'] = y, x
                    #streets['{}--{}'.format(location, to_street)] = (x, y)
                    #streets['{}--{}'.format(to_street, location)] = (x, y)
                writer.writerow(new_row)
            elif not i:  # There is no location on the first attempt - must use the address.
                new_row['location'] = ''
                new_row['from_street'] = ''
                new_row['to_street'] = ''
                writer.writerow(new_row)
    # Close the handles so the output file is flushed before anything downstream reads it.
    input_f.close()
    output_f.close()

    msg = f"Found {written} rows which are overdue for writing to the Right-of-Way Permits table."
    if written > 0 and len(open_dates) > 0:
        msg += f" open_date values range from {min(open_dates)} to {max(open_dates)}. Preparing to push them to the CKAN table."
    print(msg)
    print(f"Wrote {written} rows to {OUTPUT_FILE}.")
    if not kwparameters['test_mode'] and written > 0:
        channel = "@david"  # if (test_mode or not PRODUCTION) else "#etl-hell"
        send_to_slack(msg, username='******', channel=channel, icon=':illuminati:')
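
# A quick, hypothetical illustration of what usaddress.tag() (used by
# clean_street() inside preprocess() above) returns; the sample address is
# made up, and the tagging shown assumes usaddress's default model. This demo
# function is never called by the pipeline.
def _demo_usaddress_tagging():
    import usaddress
    parts = usaddress.tag('4120 N Main Street, Pittsburgh, PA')[0]
    # parts is an OrderedDict roughly like:
    #   {'AddressNumber': '4120', 'StreetNamePreDirectional': 'N',
    #    'StreetName': 'Main', 'StreetNamePostType': 'Street',
    #    'PlaceName': 'Pittsburgh', 'StateName': 'PA'}
    # clean_street() keeps only the pre-directional, name, and post type,
    # yielding 'N Main Street'.
    print(parts)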