def regroup_dask(task_type: str, years: List[str], resample_freq: str, filter_key: str, filter_val: str) -> bool:
    """Read per-year CSVs for ``task_type`` from the input bucket, fill NaNs,
    and write one CSV per ``split_by`` group to the output bucket via dask.

    :param task_type: key into ``task_map.task_type_map`` (e.g. 'rg-tsfare')
    :param years: year strings selecting the s3 input folders
    :param resample_freq: sub-path segment for the resample frequency
    :param filter_key: sub-path segment key (combined with filter_val)
    :param filter_val: sub-path segment value
    :returns: True on success
    :raises Exception: re-raises any error after logging it
    """
    try:
        # determine in and out buckets
        # and split_by from task type map
        in_bucket: str = task_map.task_type_map[task_type]['in']
        out_bucket: str = task_map.task_type_map[task_type]['out']
        split_by: List[str] = task_map.task_type_map[task_type]['split_by']
        date_cols: List[str] = task_map.task_type_map[task_type]['date_cols']
        dtypes: Dict = task_map.task_type_map[task_type]['dtypes']
        print('fetched in out and split_by for task_type %(task)s' % {'task': task_type})

        # read files from in bucket and concat into one df
        s3_options: Dict = ps.fetch_s3_options()
        # client is kept alive for its side effect: it registers the dask
        # scheduler used by the compute() call below
        client: Client = dask.create_dask_client(num_workers=8)

        # create out bucket
        ps.create_bucket(out_bucket)

        s3_in_url: str = 's3://' + in_bucket + '/'
        s3_sub_path: str = resample_freq + '/' + filter_key + filter_val + '/'
        # tsfare inputs are laid out flat, without the freq/filter sub-path
        if task_type == 'rg-tsfare':
            s3_sub_path = ''
        df = dd.concat([dd.read_csv(urlpath=s3_in_url + year + '/' + s3_sub_path + '*',
                                    storage_options=s3_options,
                                    parse_dates=date_cols,
                                    dtype=dtypes)
                        for year in years])
        print('read files from in bucket and concat-ted into one df')

        fillna_dict: Dict = {key: 0 for key in dtypes}
        # BUG FIX: fillna returns a new dataframe (it is not in-place);
        # the original discarded the result, leaving NaNs unfilled
        df = df.fillna(fillna_dict)

        # tsfare outputs DO carry the resample_freq sub-path
        if task_type == 'rg-tsfare':
            s3_sub_path = resample_freq + '/'
        # NOTE(review): meta=('int') is just the string 'int' (parentheses do
        # not make a tuple) — presumably intentional for dask's meta hint
        df.groupby(split_by).apply(partial(write_group_to_csv,
                                           split_by=split_by,
                                           out_bucket=out_bucket,
                                           out_path=s3_sub_path),
                                   meta=('int')).compute()

    except Exception as err:
        print('Error: %(error)s in regrouper for task_type %(task)s' % {
            'error': err,
            'task': task_type
        })
        # bare raise preserves the original traceback (raise err rewrites it)
        raise

    return True
def make_traffic(*args) -> List[str]:
    """Ensure the rs-traffic output bucket exists, then delegate to
    ``dl_tasks.make_traffic`` and return its result."""
    # renamed from `map`, which shadowed the builtin
    task_conf: Dict = task_map.task_type_map['rs-traffic']
    out_bucket: str = task_conf['out']
    ps.create_bucket(out_bucket)
    return dl_tasks.make_traffic(*args)
# stdlib imports first, then project modules
import sys
from typing import List

from data_clean import tasks as cl_tasks
from data_load import tasks as dl_tasks
from data_resample import tasks as rs_tasks
from data_tools import task_map
from utils import persistence as ps

if __name__ == '__main__':
    # usage: <script> <task_type> [year ...]
    task_type: str = sys.argv[1]
    years: List[str] = sys.argv[2:]

    # ensure the task's output bucket exists before running the pipeline
    status: bool = ps.create_bucket(task_map.task_type_map[task_type]['out'])
    if status:
        # all 'rs-*' tasks share one entry point; the rest dispatch by exact name
        if task_type.split('-', 1)[0] == 'rs':
            status = rs_tasks.perform_dask(task_type, years)
        else:
            dispatch = {
                'cl-gcabs': cl_tasks.perform_cabs_dask,
                'cl-ycabs': cl_tasks.perform_cabs_dask,
                'cl-transit': cl_tasks.perform_transit_dask,
                'cl-traffic': cl_tasks.perform_traffic_dask,
                'dl-gcabs': dl_tasks.perform_cabs_dask,
                'dl-ycabs': dl_tasks.perform_cabs_dask,
                'dl-transit': dl_tasks.perform_transit_dask,
            }
            handler = dispatch.get(task_type)
            # unrecognized task types leave status untouched, as before
            if handler is not None:
                status = handler(task_type, years)
def load_ref_files(*args) -> bool:
    """Load reference datasets into the ref-base bucket.

    Each task name in ``*args`` selects one dataset:
      - 'cabs':    taxi-zone shapefile from the NYC TLC s3 bucket
      - 'transit': MTA subway stations CSV, geocoded to a shapefile
      - 'traffic': traffic link geometries, geocoded to a shapefile
      - 'gas':     gas price CSV copied/normalized from OTHERS_BUCKET
      - 'weather': weather CSV from OTHERS_BUCKET with mean temp derived

    :param args: iterable(s) of task name strings; note list(*args) unpacks,
        so callers pass a single iterable, e.g. load_ref_files(['cabs'])
    :returns: True when all tasks completed
    :raises errors.TaskTypeError: on an unrecognized task name
    """
    for task in list(*args):
        print('loading ref files for %s' % task)
        if task in ['cabs', 'transit', 'traffic', 'gas', 'weather']:
            # create ref-base bucket
            ps.create_bucket(REFBASE_BUCKET)
            # WGS84 lat/lon CRS used for all geo outputs
            crs: Dict[str, str] = {'init': 'epsg:4326'}

            if task == 'cabs':
                # load taxi zone files
                taxi_zones_url: str = 'https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip'
                # presumably (something, http-response) — index 1 is the stream; TODO confirm
                taxi_zones_file: Tuple = http.get_stream_from_url(taxi_zones_url)
                print('zip file response status %s' % taxi_zones_file[1].status)
                # unzip
                zip_path: str = '/tmp/cabs-ref-in/'
                zipfile: ZipFile = ZipFile(BytesIO(taxi_zones_file[1].read()))
                zipfile.extractall(zip_path)
                zipfile.close()
                # process taxi shapefile
                cabs_out_path: str = '/tmp/cabs-ref-out/'
                cabs_filename: str = 'taxi_zones.shp'
                taxi_zone_df: GeoDataFrame = read_file(zip_path + cabs_filename).to_crs(crs)
                # drop columns not needed downstream; keep geometry + zone ids
                taxi_zone_df.drop(['Shape_Area', 'Shape_Leng', 'OBJECTID', 'borough', 'zone'],
                                  axis=1, inplace=True)
                os.makedirs(cabs_out_path, exist_ok=True)
                taxi_zone_df.to_file(cabs_out_path + cabs_filename)
                taxi_zone_files: List[str] = glob.glob(cabs_out_path + '*')
                # chdir so the zip stores bare filenames, not /tmp/... paths
                os.chdir(cabs_out_path)
                with ZipFile('taxi_zones.zip', 'w') as zipfile:
                    for file in taxi_zone_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                #ps.copy_files(dest_bucket=REFBASE_BUCKET, source_folder=cabs_out_path)
                ps.copy_file(dest_bucket=REFBASE_BUCKET,
                             source=cabs_out_path + 'taxi_zones.zip',
                             file='taxi_zones.zip')

            elif task == 'transit':
                # load station file
                stations_url: str = 'http://web.mta.info/developers/data/nyct/subway/Stations.csv'
                usecols: List[str] = ['Station ID', 'GTFS Stop ID', 'Stop Name', 'Borough',
                                      'GTFS Latitude', 'GTFS Longitude']
                stations_df: pd.DataFrame = pd.read_csv(stations_url, header=0,
                                                        usecols=usecols, encoding='utf-8')
                # normalize MTA column headers to snake_case
                stations_df.rename(columns={'Station ID': 'station_id',
                                            'GTFS Stop ID': 'stop_id',
                                            'Stop Name': 'stop_name',
                                            'Borough': 'borough',
                                            'GTFS Latitude': 'latitude',
                                            'GTFS Longitude': 'longitude'},
                                   inplace=True)
                stations_df.drop_duplicates(inplace=True)
                stations_df.dropna(inplace=True)
                # add fuzzy station name from turnstile data
                stations_df = add_fuzzy_station(df=stations_df)
                # build point geometry from lon/lat, then drop the raw columns
                geometry: List[Point] = [Point(xy) for xy in
                                         zip(stations_df.longitude, stations_df.latitude)]
                stations_df.drop(['latitude', 'longitude'], axis=1, inplace=True)
                stations_geodf: GeoDataFrame = GeoDataFrame(stations_df, crs=crs, geometry=geometry)
                stations_out_path: str = '/tmp/transit-ref-out/'
                os.makedirs(stations_out_path, exist_ok=True)
                stations_filename: str = 'stations.shp'
                stations_geodf.to_file(stations_out_path + stations_filename)
                station_files: List[str] = glob.glob(stations_out_path + '*')
                # chdir so the zip stores bare filenames, not /tmp/... paths
                os.chdir(stations_out_path)
                with ZipFile('stations.zip', 'w') as zipfile:
                    for file in station_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                #ps.copy_files(dest_bucket=REFBASE_BUCKET, source_folder=stations_out_path)
                ps.copy_file(dest_bucket=REFBASE_BUCKET,
                             source=stations_out_path + 'stations.zip',
                             file='stations.zip')

            elif task == 'traffic':
                # load traffic links file
                links_url: str = 'http://data.beta.nyc//dataset/e8facf61-2bb1-49e0-9128-5a8797b214c8/resource/1384aa3a-b7e2-4c28-9b5e-2808a07a7193/download/linkinfo.csv'
                cols: List[int] = [0, 1]
                names: List[str] = ['linkid', 'link']
                converters: Dict[str, Callable] = {
                    'linkid': row_ops.clean_num
                }
                links_df: pd.DataFrame = pd.read_csv(links_url, header=None, usecols=cols,
                                                     names=names, converters=converters,
                                                     encoding='utf-8')
                links_df.drop_duplicates(inplace=True)
                links_df.dropna(inplace=True)
                # each 'link' cell encodes a coordinate list; build line geometries
                geometry = [LineString(build_coord_tuples(x)) for x in links_df.link]
                links_geodf = GeoDataFrame(links_df.drop('link', axis=1),
                                           crs=crs, geometry=geometry)
                links_out_path: str = '/tmp/traffic-ref-out/'
                os.makedirs(links_out_path, exist_ok=True)
                links_filename: str = 'traffic_links.shp'
                links_geodf.to_file(links_out_path + links_filename)
                links_files: List[str] = glob.glob(links_out_path + '*')
                # chdir so the zip stores bare filenames, not /tmp/... paths
                os.chdir(links_out_path)
                with ZipFile('traffic_links.zip', 'w') as zipfile:
                    for file in links_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                ps.copy_file(dest_bucket=REFBASE_BUCKET,
                             source=links_out_path + 'traffic_links.zip',
                             file='traffic_links.zip')

            elif task == 'gas':
                # load gas data file
                filename: str = 'gas.csv'
                cols = [0, 1]
                names = ['date', 'price']
                converters = {
                    'price': row_ops.clean_num
                }
                # skiprows=2: presumably two header/title rows in the source file — TODO confirm
                gas_df: pd.DataFrame = pd.read_csv(ps.get_file_stream(bucket=OTHERS_BUCKET,
                                                                      filename=filename),
                                                   header=None, usecols=cols,
                                                   parse_dates=['date'], skiprows=2,
                                                   names=names, converters=converters,
                                                   encoding='utf-8')
                file_io.write_csv(df=gas_df, bucket=REFBASE_BUCKET, filename=filename)

            elif task == 'weather':
                # load weather data file
                filename = 'weather.csv'
                cols = [5, 8, 9, 12, 13]
                names = ['date', 'prcp', 'snow', 'tmax', 'tmin']
                converters = {
                    'prcp': row_ops.clean_num,
                    'snow': row_ops.clean_num,
                    'tmax': row_ops.clean_num,
                    'tmin': row_ops.clean_num
                }
                weather_df: pd.DataFrame = pd.read_csv(ps.get_file_stream(bucket=OTHERS_BUCKET,
                                                                          filename=filename),
                                                       header=None, usecols=cols,
                                                       parse_dates=['date'], skiprows=1,
                                                       names=names, converters=converters,
                                                       encoding='utf-8')
                # derive mean temperature and drop the min/max columns
                weather_df['temp'] = (weather_df['tmax'] + weather_df['tmin']) / 2
                weather_df = weather_df.drop(columns=['tmax', 'tmin'])
                file_io.write_csv(df=weather_df, bucket=REFBASE_BUCKET, filename=filename)

        else:
            print('unrecognized ref-base load task %s' % task)
            raise errors.TaskTypeError('ref-base load ' + task)

    return True
def make_ycabs(*args) -> List[str]:
    """Ensure the cl-ycabs output bucket exists, then delegate to
    ``dl_tasks.make_ycabs`` and return its result."""
    # renamed from `map`, which shadowed the builtin
    task_conf: Dict = task_map.task_type_map['cl-ycabs']
    out_bucket: str = task_conf['out']
    ps.create_bucket(out_bucket)
    return dl_tasks.make_ycabs(*args)