Example #1
def regroup_dask(task_type: str, years: List[str], resample_freq: str,
                 filter_key: str, filter_val: str) -> bool:
    try:
        # look up in/out buckets, split_by keys, date columns
        # and dtypes from the task type map
        in_bucket: str = task_map.task_type_map[task_type]['in']
        out_bucket: str = task_map.task_type_map[task_type]['out']
        split_by: List[str] = task_map.task_type_map[task_type]['split_by']
        date_cols: List[str] = task_map.task_type_map[task_type]['date_cols']
        dtypes: Dict = task_map.task_type_map[task_type]['dtypes']
        print('fetched in/out buckets and split_by for task_type %(task)s' %
              {'task': task_type})

        # read files from the in bucket and concat into one df;
        # the Client registers itself as the default dask scheduler
        s3_options: Dict = ps.fetch_s3_options()
        client: Client = dask.create_dask_client(num_workers=8)

        # create out bucket
        ps.create_bucket(out_bucket)

        s3_in_url: str = 's3://' + in_bucket + '/'
        s3_sub_path: str = resample_freq + '/' + filter_key + filter_val + '/'
        if task_type == 'rg-tsfare':
            s3_sub_path = ''
        df = dd.concat([
            dd.read_csv(urlpath=s3_in_url + year + '/' + s3_sub_path + '*',
                        storage_options=s3_options,
                        parse_dates=date_cols,
                        dtype=dtypes) for year in years
        ])

        print('read files from in bucket and concatenated into one df')
        # fillna returns a new dataframe; assign the result back
        fillna_dict: Dict = {key: 0 for key in dtypes}
        df = df.fillna(fillna_dict)
        if task_type == 'rg-tsfare':
            s3_sub_path = resample_freq + '/'
        df.groupby(split_by).apply(partial(write_group_to_csv,
                                           split_by=split_by,
                                           out_bucket=out_bucket,
                                           out_path=s3_sub_path),
                                   meta='int').compute()

    except Exception as err:
        print('Error: %(error)s in regrouper for task_type %(task)s' % {
            'error': err,
            'task': task_type
        })
        # bare raise preserves the original traceback
        raise

    return True
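
A minimal driver sketch for the function above; the 'rg-tsfare' key is taken from the code itself, while the resample_freq, filter_key and filter_val values are illustrative assumptions:

# hypothetical invocation of regroup_dask (argument values are assumptions)
if regroup_dask(task_type='rg-tsfare', years=['2016', '2017'],
                resample_freq='1M', filter_key='', filter_val=''):
    print('regroup complete')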
Example #2
def make_traffic(*args) -> List[str]:
    task_spec: Dict = task_map.task_type_map['rs-traffic']
    out_bucket: str = task_spec['out']
    ps.create_bucket(out_bucket)
    return dl_tasks.make_traffic(*args)
Example #3
from utils import persistence as ps
from data_tools import task_map
import sys
from data_resample import tasks as rs_tasks
from data_clean import tasks as cl_tasks
from data_load import tasks as dl_tasks
from typing import List

if __name__ == '__main__':
    task_type: str = sys.argv[1]
    years: List[str] = sys.argv[2:]
    # create the out bucket for the task type, then dispatch on its prefix
    status: bool = ps.create_bucket(task_map.task_type_map[task_type]['out'])
    if status:
        task_prefix: str = task_type.split('-', 1)[0]
        if task_prefix == 'rs':
            status = rs_tasks.perform_dask(task_type, years)
        elif task_type in ['cl-gcabs', 'cl-ycabs']:
            status = cl_tasks.perform_cabs_dask(task_type, years)
        elif task_type == 'cl-transit':
            status = cl_tasks.perform_transit_dask(task_type, years)
        elif task_type == 'cl-traffic':
            status = cl_tasks.perform_traffic_dask(task_type, years)
        elif task_type in ['dl-gcabs', 'dl-ycabs']:
            status = dl_tasks.perform_cabs_dask(task_type, years)
        elif task_type == 'dl-transit':
            status = dl_tasks.perform_transit_dask(task_type, years)
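
For reference, the entry point takes the task type followed by one or more years as positional arguments. A hypothetical invocation (the script filename is an assumption):

# python pipeline.py cl-transit 2017 2018
# yields task_type == 'cl-transit' and years == ['2017', '2018']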
Example #4
def load_ref_files(*args) -> bool:
    # note: list(*args) expects a single iterable of task names
    for task in list(*args):
        print('loading ref files for %s' % task)

        if task in ['cabs', 'transit', 'traffic', 'gas', 'weather']:
            # create ref-base bucket
            ps.create_bucket(REFBASE_BUCKET)
            crs: Dict[str, str] = {'init': 'epsg:4326'}
            if task == 'cabs':
                # load taxi zone files
                taxi_zones_url: str = 'https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip'
                taxi_zones_file: Tuple = http.get_stream_from_url(taxi_zones_url)
                print('zip file response status %s' % taxi_zones_file[1].status)
                # unzip
                zip_path: str = '/tmp/cabs-ref-in/'
                zipfile: ZipFile = ZipFile(BytesIO(taxi_zones_file[1].read()))
                zipfile.extractall(zip_path)
                zipfile.close()

                # process taxi shapefile
                cabs_out_path: str = '/tmp/cabs-ref-out/'
                cabs_filename: str = 'taxi_zones.shp'
                taxi_zone_df: GeoDataFrame = read_file(zip_path + cabs_filename).to_crs(crs)
                taxi_zone_df.drop(['Shape_Area', 'Shape_Leng', 'OBJECTID', 'borough', 'zone'],
                                  axis=1, inplace=True)
                os.makedirs(cabs_out_path, exist_ok=True)
                taxi_zone_df.to_file(cabs_out_path+cabs_filename)
                taxi_zone_files: List[str] = glob.glob(cabs_out_path+'*')
                os.chdir(cabs_out_path)
                with ZipFile('taxi_zones.zip', 'w') as zipfile:
                    for file in taxi_zone_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                ps.copy_file(dest_bucket=REFBASE_BUCKET, source=cabs_out_path+'taxi_zones.zip', file='taxi_zones.zip')

            elif task == 'transit':
                # load station file
                stations_url: str = 'http://web.mta.info/developers/data/nyct/subway/Stations.csv'
                usecols: List[str] = ['Station ID', 'GTFS Stop ID', 'Stop Name', 'Borough',
                                      'GTFS Latitude', 'GTFS Longitude']
                stations_df: pd.DataFrame = pd.read_csv(stations_url, header=0, usecols=usecols,
                                                        encoding='utf-8')
                stations_df.rename(columns={'Station ID': 'station_id', 'GTFS Stop ID': 'stop_id',
                                            'Stop Name': 'stop_name', 'Borough': 'borough',
                                            'GTFS Latitude': 'latitude', 'GTFS Longitude': 'longitude'},
                                   inplace=True)

                stations_df.drop_duplicates(inplace=True)
                stations_df.dropna(inplace=True)

                # add fuzzy station name from turnstile data
                stations_df = add_fuzzy_station(df=stations_df)

                geometry: List[Point] = [Point(xy) for xy in zip(stations_df.longitude, stations_df.latitude)]
                stations_df.drop(['latitude', 'longitude'], axis=1, inplace=True)
                stations_geodf: GeoDataFrame = GeoDataFrame(stations_df, crs=crs, geometry=geometry)
                stations_out_path: str = '/tmp/transit-ref-out/'
                os.makedirs(stations_out_path, exist_ok=True)
                stations_filename: str = 'stations.shp'
                stations_geodf.to_file(stations_out_path+stations_filename)
                station_files: List[str] = glob.glob(stations_out_path+'*')
                os.chdir(stations_out_path)
                with ZipFile('stations.zip', 'w') as zipfile:
                    for file in station_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                ps.copy_file(dest_bucket=REFBASE_BUCKET, source=stations_out_path+'stations.zip', file='stations.zip')

            elif task == 'traffic':
                # load traffic links file
                links_url: str = 'http://data.beta.nyc//dataset/e8facf61-2bb1-49e0-9128-5a8797b214c8/resource/1384aa3a-b7e2-4c28-9b5e-2808a07a7193/download/linkinfo.csv'
                cols: List[int] = [0, 1]
                names: List[str] = ['linkid', 'link']
                converters: Dict[str, Callable] = {'linkid': row_ops.clean_num}
                links_df: pd.DataFrame = pd.read_csv(links_url,
                                                     header=None,
                                                     usecols=cols,
                                                     names=names,
                                                     converters=converters,
                                                     encoding='utf-8')

                links_df.drop_duplicates(inplace=True)
                links_df.dropna(inplace=True)

                geometry = [LineString(build_coord_tuples(x)) for x in links_df.link]
                links_geodf = GeoDataFrame(links_df.drop('link', axis=1),
                                           crs=crs,
                                           geometry=geometry)

                links_out_path: str = '/tmp/traffic-ref-out/'
                os.makedirs(links_out_path, exist_ok=True)
                links_filename: str = 'traffic_links.shp'
                links_geodf.to_file(links_out_path+links_filename)
                links_files: List[str] = glob.glob(links_out_path+'*')
                os.chdir(links_out_path)
                with ZipFile('traffic_links.zip', 'w') as zipfile:
                    for file in links_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                ps.copy_file(dest_bucket=REFBASE_BUCKET, source=links_out_path+'traffic_links.zip', file='traffic_links.zip')

            elif task == 'gas':
                # load gas data file
                filename: str = 'gas.csv'
                cols = [0, 1]
                names = ['date', 'price']
                converters = {'price': row_ops.clean_num}
                gas_df: pd.DataFrame = pd.read_csv(ps.get_file_stream(bucket=OTHERS_BUCKET, filename=filename),
                                                   header=None,
                                                   usecols=cols,
                                                   parse_dates=['date'],
                                                   skiprows=2,
                                                   names=names,
                                                   converters=converters,
                                                   encoding='utf-8')

                file_io.write_csv(df=gas_df, bucket=REFBASE_BUCKET, filename=filename)

            elif task == 'weather':
                # load weather data file
                filename = 'weather.csv'
                cols = [5, 8, 9, 12, 13]
                names = ['date', 'prcp', 'snow', 'tmax', 'tmin']
                converters = {'prcp': row_ops.clean_num,
                              'snow': row_ops.clean_num,
                              'tmax': row_ops.clean_num,
                              'tmin': row_ops.clean_num}
                weather_df: pd.DataFrame = pd.read_csv(ps.get_file_stream(bucket=OTHERS_BUCKET, filename=filename),
                                                       header=None,
                                                       usecols=cols,
                                                       parse_dates=['date'],
                                                       skiprows=1,
                                                       names=names,
                                                       converters=converters,
                                                       encoding='utf-8')
                # average tmax and tmin into a single temp column
                weather_df['temp'] = (weather_df['tmax']+weather_df['tmin'])/2
                weather_df = weather_df.drop(columns=['tmax', 'tmin'])
                file_io.write_csv(df=weather_df, bucket=REFBASE_BUCKET, filename=filename)

        else:
            print('unrecognized ref-base load task %s' % task)
            raise errors.TaskTypeError('ref-base load '+task)
    return True
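
The traffic branch calls a build_coord_tuples helper that is not shown in this example. A minimal sketch, assuming the link field holds space-separated 'lat,lon' pairs and that shapely's LineString expects (x, y), i.e. (lon, lat), tuples:

from typing import List, Tuple

def build_coord_tuples(link: str) -> List[Tuple[float, float]]:
    # assumed input format: 'lat1,lon1 lat2,lon2 ...'
    coords: List[Tuple[float, float]] = []
    for pair in link.strip().split(' '):
        lat, lon = pair.split(',')[:2]
        # swap to (lon, lat) since shapely geometries take x before y
        coords.append((float(lon), float(lat)))
    return coords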
Example #5
def make_ycabs(*args) -> List[str]:
    task_spec: Dict = task_map.task_type_map['cl-ycabs']
    out_bucket: str = task_spec['out']
    ps.create_bucket(out_bucket)
    return dl_tasks.make_ycabs(*args)
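
make_traffic (Example #2) and make_ycabs differ only in the task-map key and the delegated loader. A hedged refactoring sketch; the helper name and signature are assumptions, not part of the original code:

from typing import Callable, Dict, List

def make_with_bucket(task_key: str, delegate: Callable[..., List[str]], *args) -> List[str]:
    # create the task's out bucket, then delegate to the loader
    out_bucket: str = task_map.task_type_map[task_key]['out']
    ps.create_bucket(out_bucket)
    return delegate(*args)

# usage: make_with_bucket('cl-ycabs', dl_tasks.make_ycabs, *args)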