Example #1
def ftp_and_prime_geocoder(job, **kwparameters):
    import csv

    # Deserializing the command-line parameters feels kludgy, but it also
    # doesn't seem worth folding them into the Job object just for the custom
    # processing function and the run_pipeline function.
    use_local_input_file = kwparameters['use_local_input_file']
    if not use_local_input_file:
        fetch_city_file(job)
    # Geocoding Stuff
    areas = [
        'NEIGHBORHOOD', 'TRACT', 'COUNCIL_DISTRICT', 'PLI_DIVISION',
        'POLICE_ZONE', 'FIRE_ZONE', 'PUBLIC_WORKS_DIVISION', 'WARD'
    ]

    parcel_file = job.local_directory + "parcel_areas.csv"

    coords = {}  # Map each parcel PIN to its coordinates and area assignments.
    with open(parcel_file) as f:
        dr = csv.DictReader(f)
        for row in dr:
            coords[row['PIN']] = {'x': row['x'], 'y': row['y']}
            for area in areas:
                coords[row['PIN']][area] = row[area]
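
The kwparameters unpacked above presumably originate as command-line flags on the launcher. A minimal sketch of how such flags could be collected and forwarded (the flag name is taken from the code above; the argparse wiring itself is an assumption, not rocket-etl's actual interface):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--use_local_input_file', action='store_true',
                    help='Skip the download and use an already-fetched file.')
args = parser.parse_args()

# Forward the parsed flags as keyword parameters, mirroring how
# ftp_and_prime_geocoder(job, **kwparameters) unpacks them.
ftp_and_prime_geocoder(job, use_local_input_file=args.use_local_input_file)
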
Example #2
def conditionally_get_city_files(job, **kwparameters):
    if not kwparameters['use_local_files']:
        fetch_city_file(job)
Example #3
def preprocess(job, **kwparameters):
    import copy
    import csv
    import datetime
    import re

    import usaddress

    get_file_job = Job({
        'source_type': 'sftp',  # This parameter is currently unneeded by fetch_city_file.
        'source_dir': '',
        'source_file': 'right_of_way_permits.csv',
        'job_directory': 'right_of_way_backup',  # This parameter is usually auto-added in launchpad.py.
    })
    # This is the first time a little joblet was created to handle the
    # file-fetching in the initial custom-processing script. Eventually the
    # manner for doing this stuff should be standardized... Other scripts have
    # broken such a multi-step process into two processes at the job_dict
    # level (see the sketch at the end of this example), which is another
    # option and is perhaps a little easier for the reader to follow. It would
    # also route around the INPUT_FILE/OUTPUT_FILE stuff below. Really, this
    # alternate approach is a result of taking two existing scripts and
    # combining them into one rocket-etl-style script in a way that minimized
    # code changes and possible unanticipated bugs.
    fetch_city_file(get_file_job)

    print("Let the preprocessing begin...")
    INPUT_FILE = '/home/daw165/rocket-etl/source_files/right_of_way_backup/right_of_way_permits.csv'
    OUTPUT_FILE = '/home/daw165/rocket-etl/source_files/right_of_way_backup/right_of_way_transmuted_backup.csv'
    INPUT_HEADING = [
        "id", 'display', "type.display", "openDate", "Date.From", "Date.To",
        "asis.Restoration.cDate", "description", "loc.address_full",
        "asis.From", "asis.From.c2", "asis.From.c3", "asis.From.c4",
        "asis.From.c5", "asis.From.c1", "asis.To", "asis.To.c2", "asis.To.c3",
        "asis.To.c4", "asis.To.c5", "asis.To.c1", "asis.Location",
        "asis.Location.c2", "asis.Location.c3", "asis.Location.c4",
        "asis.Location.c5", "asis.Location.c1", "prof.businessName",
        "prof.licenseType.value", "geo.x", "geo.y"
    ]

    OUTPUT_HEADING = [
        'display', "sequence", "type", "open_date", "from_date", "to_date",
        "restoration_date", "description", "full_address", "location",
        "from_street", "to_street", "business_name", "license_type",
        "latitude", "longitude", "from_lat", "from_lon", "to_lat", "to_lon"
    ]

    ROW_MAPPING = {
        'display': 'display',
        'type': 'type.display',
        'open_date': 'openDate',
        'from_date': 'Date.From',
        'to_date': 'Date.To',
        'restoration_date': 'asis.Restoration.cDate',
        'description': 'description',
        'full_address': 'loc.address_full',
        # 'location': '',
        # 'from': '',
        # 'to': '',
        'business_name': 'prof.businessName',
        'license_type': 'prof.licenseType.value',
        'latitude': 'geo.y',
        'longitude': 'geo.x'
    }
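
    # The fields commented out above ('location', 'from', 'to'), along with
    # 'sequence' and the from/to coordinates, are not copied straight from the
    # input row; they are filled in per street segment in the loop below.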

    def clean_street(addr):
        # Reduce a full address to just its street name (pre-directional +
        # street name + post type), dropping house numbers and unit info.
        try:
            parts = usaddress.tag(addr)[0]
            pre = parts.get('StreetNamePreDirectional', '')
            name = parts.get('StreetName', '')
            post = parts.get('StreetNamePostType', '')
            return re.sub(r'\s+', ' ', '{} {} {}'.format(pre, name, post)).strip()
        except Exception:  # A bare except here would also swallow KeyboardInterrupt.
            print("Unable to parse address: {}".format(addr))
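
    # Worked example, assuming usaddress parses the address as expected:
    #   clean_street('123 N Main St')  ->  'N Main St'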

    input_f = open(INPUT_FILE)
    output_f = open(OUTPUT_FILE, 'w')

    reader = csv.DictReader(input_f)
    writer = csv.DictWriter(output_f, OUTPUT_HEADING)

    writer.writeheader()

    resource_id = "cc17ee69-b4c8-4b0c-8059-23af341c9214"  # Production version of Right-of-Way Permits table
    published_ids = [
        x['id']
        for x in query_resource(site, 'SELECT DISTINCT id FROM "{}"'.format(resource_id), API_key)
    ]
    print(f"Found {len(published_ids)} extant, published 'id' values.")

    written = 0
    open_dates = []
    for row in reader:
        # This condition has been reversed with respect to the original
        # preprocess.py script to catch the uncaught rows.
        open_date = datetime.datetime.strptime(row['openDate'], '%Y-%m-%d')
        if open_date >= datetime.datetime.now() - datetime.timedelta(days=6):
            print('too new', row['openDate'])
            continue

        # If this row's 'display' value is already among the published 'id'
        # values, skip it.
        if row['display'] in published_ids:
            continue
        written += 1

        new_row_base = {}

        for new_heading, old_heading in ROW_MAPPING.items():
            new_row_base[new_heading] = row[old_heading]

        for i in range(6):
            if not i:
                suffix = ''  # No appended number for the 0th item.
            else:
                suffix = '.c' + str(i)
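
            # i = 0 reads the bare columns (e.g., 'asis.Location'); i = 1..5
            # read the '.c1' through '.c5' variants (e.g., 'asis.Location.c3').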

            location = row['asis.Location{}'.format(suffix)]
            to_street = row['asis.To{}'.format(suffix)]
            from_street = row['asis.From{}'.format(suffix)]

            new_row = copy.deepcopy(new_row_base)
            # Sequence is 0 for the base columns; the '.cN' columns get the
            # string str(N - 1).
            new_row['sequence'] = str(i - 1) if i else 0

            open_dates.append(row['openDate'])

            if location != 'NA':
                new_row['location'] = location
                new_row['from_street'] = from_street
                new_row['to_street'] = to_street
                new_row['from_lat'], new_row['from_lon'] = '', ''
                new_row['to_lat'], new_row['to_lon'] = '', ''
                location = clean_street(location)
                if from_street != 'NA':
                    x, y = '', ''  ##cross_geocode(location, from_street)
                    new_row['from_lat'], new_row['from_lon'] = y, x
                    #streets['{}--{}'.format(location, from_street)] = (x, y)
                    #streets['{}--{}'.format(from_street, location)] = (x, y)

                if to_street != 'NA':
                    x, y = '', ''  ##cross_geocode(location, to_street)
                    new_row['to_lat'], new_row['to_lon'] = y, x
                    #streets['{}--{}'.format(location, to_street)] = (x, y)
                    #streets['{}--{}'.format(to_street, location)] = (x, y)

                writer.writerow(new_row)

            elif not i:  # There is no location on the first attempt - must use the address.
                new_row['location'] = ''
                new_row['from_street'] = ''
                new_row['to_street'] = ''
                writer.writerow(new_row)

    msg = f"Found {written} rows which are overdue for writing to the Right-of-Way Permits table."
    if written > 0 and len(open_dates) > 0:
        msg += f" open_date values range from {min(open_dates)} to {max(open_dates)}. Preparing to push them to the CKAN table."
    print(msg)
    print(f"Wrote {written} rows to {OUTPUT_FILE}.")
    if not kwparameters['test_mode'] and written > 0:
        channel = "@david"  #if (test_mode or not PRODUCTION) else "#etl-hell"
        send_to_slack(msg, username='******', channel=channel, icon=':illuminati:')
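
For context, the custom-processing functions above get invoked by rocket-etl's launcher against a job description. A hypothetical wiring for Example #3, following the comment about splitting a multi-step process into two processes at the job_dict level (the 'custom_processing' key and the two-entry split are assumptions based on the comments above, not confirmed rocket-etl field names):

job_dicts = [
    {   # Step 1: fetch the raw file from the city's SFTP server.
        'source_type': 'sftp',
        'source_dir': '',
        'source_file': 'right_of_way_permits.csv',
        'job_directory': 'right_of_way_backup',
    },
    {   # Step 2: transmute the fetched file and push new rows to CKAN.
        'source_file': 'right_of_way_permits.csv',
        'custom_processing': preprocess,  # Hypothetical key name.
    },
]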