Example #1
def make_plots(buffer_radius_miles: float,
               stations_geodf: GeoDataFrame,
               taxi_zone_df: GeoDataFrame,
               links_df: GeoDataFrame,
               annotate: bool = False,
               plot_path: Optional[str] = None) -> bool:

    sns.set(font_scale=.65)
    sns.set_style('white')
    plt.close('all')
    # create plots
    fig, ax = plt.subplots(1, figsize=(18, 10), clear=True)
    # taxi zones plot
    taxi_zone_df.plot(ax=ax,
                      facecolor='#F9DA95',
                      edgecolor='#FFFFFF',
                      linewidth=0.5)

    stations_geodf.plot(ax=ax,
                        facecolor='#618A98',
                        edgecolor='#618A98',
                        alpha=0.2)
    stations_points_geodf = stations_geodf.copy().set_geometry('point').drop(
        columns=['circle'])
    stations_points_geodf.plot(ax=ax, color='#787064', markersize=.5)
    if annotate:
        stations_points_geodf.apply(
            lambda x: plt.annotate(text=x['tsstation'],
                                   xy=x['point'].coords[0],
                                   horizontalalignment='center'),
            axis=1)
    links_df.plot(ax=ax, color='#AE4B16', linewidth=0.5)

    fig.text(.5,
             .05,
             'NYC stations with circles-of-influence (radius ' +
             str(buffer_radius_miles) + ' miles)',
             ha='center')
    ax.set_axis_off()
    plt.axis('equal')

    # save plots; savefig() must come before show(), which can leave the
    # figure blank on interactive backends
    plotfilepath: str = '/tmp/'
    plotfilename: str = 'geomerged' + str(buffer_radius_miles) + EXT
    remotefilename: str = plotfilename
    plt.savefig(plotfilepath + plotfilename)
    plt.show()
    if plot_path is not None:
        remotefilename = plot_path + plotfilename
    status: bool = ps.copy_file(dest_bucket=PLOTS_BUCKET,
                                file=remotefilename,
                                source=plotfilepath + plotfilename)

    return status
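
A minimal sketch of the stations input make_plots expects, assuming shapely and geopandas are available. The column names 'tsstation', 'point' and 'circle' mirror what the function accesses; the coordinates and the crude miles-to-degrees conversion are invented for illustration.

from shapely.geometry import Point
from geopandas import GeoDataFrame

def example_stations(buffer_radius_miles: float = 0.25) -> GeoDataFrame:
    # crude conversion for a sketch: ~69 miles per degree of latitude
    radius_deg = buffer_radius_miles / 69.0
    points = [Point(-73.99, 40.75), Point(-73.98, 40.76)]
    df = GeoDataFrame({'tsstation': ['34 ST', 'TIMES SQ'], 'point': points})
    # make_plots draws the 'circle' geometry first, then switches to 'point'
    df['circle'] = df['point'].apply(lambda p: p.buffer(radius_deg))
    return df.set_geometry('circle')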
Example #2
def perform_traffic(b_task: bytes) -> bool:
    block_number: int = int(str(b_task, 'utf-8'))
    #url: str = "https://data.cityofnewyork.us/api/views/i4gi-tjb9/rows.csv?accessType=DOWNLOAD&bom=true&query=select+*"
    url: str = "https://data.bloomington.in.gov/dataset/117733fb-31cb-480a-8b30-fbf425a690cd/resource/d5ba88f9-5798-46cd-888a-189eb59f7b46/download/traffic-counts2013-2015.csv"
    content_length: int = http.get_content_length(url)
    total_blocks: int = 20
    chunks_per_block = 5
    chunk_size = content_length // (total_blocks * chunks_per_block)
    print('content length is %(length)i and chunk size is %(cs)i' % {
        'length': content_length,
        'cs': chunk_size
    })
    source_folder: str = os.path.dirname(__file__) + '/traffic/'
    os.makedirs(source_folder, exist_ok=True)
    print('created source folder ' + source_folder)
    status: bool = False
    start_chunk: int = (block_number - 1) * chunks_per_block + 1
    start_byte: int = (start_chunk - 1) * chunk_size
    # exclusive bound for range(); the original '+ 1' fetched six chunks per
    # block, overlapping the next block
    end_chunk: int = start_chunk + chunks_per_block
    last_chunk_in_file = total_blocks * chunks_per_block
    try:
        for i in range(start_chunk, end_chunk):
            end_byte: int = start_byte + chunk_size - 1
            byte_range: str = 'bytes='
            # only the file's final chunk gets an open-ended range, so the
            # integer-division remainder at the end of the file is not lost
            if i != last_chunk_in_file:
                byte_range = byte_range + '%(start)i-%(end)i' % {
                    'start': start_byte,
                    'end': end_byte
                }
            else:
                byte_range = byte_range + '%(start)i-' % {'start': start_byte}
            print('downloading file from ' + url + ' for byte range ' +
                  byte_range)
            filename: str = http.download_chunk_from_url(
                url=url,
                folder=source_folder,
                byte_range=byte_range,
                filename='traffic_speed.part' + str(i))
            print('copying file ' + filename + ' to bucket traffic')
            status = ps.copy_file(dest_bucket='traffic',
                                  file=filename,
                                  source=source_folder + filename)
            start_byte = end_byte + 1

    except Exception:
        # re-raise with the original traceback intact
        raise
    else:
        return status
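
A standalone check of the byte-range arithmetic above, with invented numbers (content_length=1000, the default 20 blocks of 5 chunks):

def block_ranges(content_length: int, block_number: int,
                 total_blocks: int = 20, chunks_per_block: int = 5) -> list:
    chunk_size = content_length // (total_blocks * chunks_per_block)
    last_chunk = total_blocks * chunks_per_block
    start_chunk = (block_number - 1) * chunks_per_block + 1
    ranges = []
    for i in range(start_chunk, start_chunk + chunks_per_block):
        start_byte = (i - 1) * chunk_size
        # the final chunk of the file is open-ended
        end = '' if i == last_chunk else str(start_byte + chunk_size - 1)
        ranges.append('bytes=%i-%s' % (start_byte, end))
    return ranges

# block 2 covers chunks 6-10: bytes 50-59, 60-69, 70-79, 80-89, 90-99
print(block_ranges(1000, 2))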
Example #3
def perform_cabs(cab_type: str, b_task: bytes) -> bool:
    bucket: str
    if cab_type == 'green':
        file_suffix = 'green'
        bucket = 'gcabs'
    elif cab_type == 'yellow':
        file_suffix = 'yellow'
        bucket = 'ycabs'
    else:
        # fail fast instead of hitting an UnboundLocalError further down
        raise ValueError('unrecognized cab type ' + cab_type)
    task: str = str(b_task, 'utf-8')
    task_split: List[str] = task.split('-')
    year: str = task_split[0]
    urls: List[str]
    if cab_type == 'green':
        quarter: int = int(task_split[1])
        months = lambda quarter: range((quarter - 1) * 3 + 1,
                                       (quarter - 1) * 3 + 4)
        get_url = lambda month: ('https://s3.amazonaws.com/nyc-tlc/trip+data/'
                                 + file_suffix + '_tripdata_' + year + '-'
                                 + prefix_zero(month) + '.csv')
        urls = list(map(get_url, months(quarter)))
    elif cab_type == 'yellow':
        month: int = int(task_split[1])
        #months = lambda bimonth: range( (bimonth-1)*2+1, (bimonth-1)*2+3 )
        #get_url = lambda month: 'https://s3.amazonaws.com/nyc-tlc/trip+data/'+file_suffix+'_tripdata_'+year+'-'+prefix_zero(month)+'.csv'
        #urls = list(map(get_url, months(bimonth)))
        urls = [
            'https://s3.amazonaws.com/nyc-tlc/trip+data/' + file_suffix +
            '_tripdata_' + year + '-' + prefix_zero(month) + '.csv'
        ]

    print('downloading from urls ' + str(urls))
    source_folder: str = os.path.dirname(__file__) + '/' + bucket + '/'
    os.makedirs(source_folder, exist_ok=True)
    print('created source folder ' + source_folder)
    status: bool = False
    try:
        for url in urls:
            print('downloading file from ' + url)
            filename: str = http.download_from_url(url, source_folder)
            print('copying file ' + filename + ' to bucket ' + bucket)
            status = ps.copy_file(dest_bucket=bucket,
                                  file=filename,
                                  source=source_folder + filename)

    except Exception:
        # re-raise with the original traceback intact
        raise
    else:
        return status
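
A quick sanity check of the quarter-to-months mapping used for green cabs above: quarter q covers months (q-1)*3+1 through (q-1)*3+3.

months = lambda quarter: range((quarter - 1) * 3 + 1, (quarter - 1) * 3 + 4)
assert list(months(1)) == [1, 2, 3]
assert list(months(4)) == [10, 11, 12]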
Example #4
def perform_tsfare(b_task: bytes) -> bool:
    task: str = str(b_task, 'utf-8')
    task_split: List[str] = task.split('-')
    year: str = task_split[0]
    month: int = int(task_split[1])
    url_part1: str = "http://web.mta.info/developers/data/nyct/fares/fares_" + year + prefix_zero(
        month)
    url_part2: str = ".csv"
    #urls: List[str] = [url_part1+prefix_zero(day)+url_part2 for day in range(1, 32)]
    print('downloading from transit fare urls ')
    source_folder: str = os.path.dirname(__file__) + '/tsfare/'
    os.makedirs(source_folder, exist_ok=True)
    print('created source folder ' + source_folder)
    status: bool = False
    td: Timedelta = Timedelta(14, unit='d')
    try:
        for day in range(1, 32):
            url = url_part1 + prefix_zero(day) + url_part2
            print('downloading file from ' + url)
            try:
                filename: str = http.download_from_url(url, source_folder)
            except u_err.HTTPError as err:
                # ignore bad urls
                if err.code == 404:
                    print('ignoring bad transit fare url ' + url)
                    # do not attempt to copy file to minio
                    continue
                else:
                    raise

            df = read_csv(source_folder + filename, skiprows=2)
            # stamp each file with its date, shifted back two weeks (td)
            date: str = prefix_zero(month) + '/' + prefix_zero(
                day) + '/20' + year
            df['date'] = to_datetime(date, format='%m/%d/%Y') - td
            # index=False avoids writing a spurious index column back out
            df.to_csv(source_folder + filename, index=False)
            print('copying file ' + filename + ' to bucket tsfare')
            status = ps.copy_file(dest_bucket='tsfare',
                                  file=filename,
                                  source=source_folder + filename)

    except Exception:
        # re-raise with the original traceback intact
        raise
    else:
        return status
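
The two-week shift above in isolation (pandas only; the date is invented). The assumption here, flagged rather than verified: each posted fares file covers a period ending roughly two weeks before its file date, hence the Timedelta(14, unit='d') subtraction.

from pandas import Timedelta, to_datetime

td = Timedelta(14, unit='d')
# a file dated 01/14/2017 is re-stamped to 2016-12-31
print(to_datetime('01/14/2017', format='%m/%d/%Y') - td)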
Example #5
def perform_transit(b_task: bytes) -> bool:
    task: str = str(b_task, 'utf-8')
    task_split: List[str] = task.split('-')
    year: str = task_split[0]
    month: int = int(task_split[1])
    url_part1: str = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_" + year + prefix_zero(
        month)
    url_part2: str = ".txt"
    urls: List[str] = [
        url_part1 + prefix_zero(day) + url_part2 for day in range(1, 32)
    ]
    print('downloading from transit urls ')
    source_folder: str = os.path.dirname(__file__) + '/transit/'
    os.makedirs(source_folder, exist_ok=True)
    print('created source folder ' + source_folder)
    status: bool = False
    try:
        for url in urls:
            print('downloading file from ' + url)
            try:
                filename: str = http.download_from_url(url, source_folder)
            except u_err.HTTPError as err:
                # ignore bad urls
                if err.code == 404:
                    print('ignoring bad transit url ' + url)
                    # do not attempt to copy file to minio
                    continue
                else:
                    raise


            print('copying file ' + filename + ' to bucket transit')
            status = ps.copy_file(dest_bucket='transit',
                                  file=filename,
                                  source=source_folder + filename)

    except Exception:
        # re-raise with the original traceback intact
        raise
    else:
        return status
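
Usage sketch, assuming the module's helpers (http, ps, prefix_zero) are configured. Tasks arrive as b'<yy>-<mm>' byte strings, so b'17-03' tries turnstile_170301.txt through turnstile_170331.txt and skips the 404s.

if __name__ == '__main__':
    perform_transit(b'17-03')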
Example #6
def load_ref_files(*args) -> bool:
    for task in list(*args):
        print('loading ref files for %s' % task)

        if task in ['cabs', 'transit', 'traffic', 'gas', 'weather']:
            # create ref-base bucket
            ps.create_bucket(REFBASE_BUCKET)
            crs: Dict[str, str] = {'init': 'epsg:4326'}
            if task == 'cabs':
                # load taxi zone files
                taxi_zones_url: str = 'https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip'
                taxi_zones_file: Tuple = http.get_stream_from_url(taxi_zones_url)
                print('zip file response status %s' % taxi_zones_file[1].status)
                # unzip
                zip_path: str = '/tmp/cabs-ref-in/'
                zipfile: ZipFile = ZipFile(BytesIO(taxi_zones_file[1].read()))
                zipfile.extractall(zip_path)
                zipfile.close()

                # process taxi shapefile
                cabs_out_path: str = '/tmp/cabs-ref-out/'
                cabs_filename: str = 'taxi_zones.shp'
                taxi_zone_df: GeoDataFrame = read_file(zip_path + cabs_filename).to_crs(crs)
                taxi_zone_df.drop(['Shape_Area', 'Shape_Leng', 'OBJECTID', 'borough', 'zone'],
                                  axis=1, inplace=True)
                os.makedirs(cabs_out_path, exist_ok=True)
                taxi_zone_df.to_file(cabs_out_path+cabs_filename)
                taxi_zone_files: List[str] = glob.glob(cabs_out_path+'*')
                os.chdir(cabs_out_path)
                with ZipFile('taxi_zones.zip', 'w') as zipfile:
                    for file in taxi_zone_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                #ps.copy_files(dest_bucket=REFBASE_BUCKET, source_folder=cabs_out_path)
                ps.copy_file(dest_bucket=REFBASE_BUCKET, source=cabs_out_path+'taxi_zones.zip', file='taxi_zones.zip')

            elif task == 'transit':
                # load station file
                stations_url: str = 'http://web.mta.info/developers/data/nyct/subway/Stations.csv'
                usecols: List[str] = ['Station ID', 'GTFS Stop ID', 'Stop Name', 'Borough',
                                      'GTFS Latitude', 'GTFS Longitude']
                stations_df: pd.DataFrame = pd.read_csv(stations_url, header=0, usecols=usecols,
                                                        encoding='utf-8')
                stations_df.rename(columns={'Station ID': 'station_id', 'GTFS Stop ID': 'stop_id',
                                            'Stop Name': 'stop_name', 'Borough': 'borough',
                                            'GTFS Latitude': 'latitude', 'GTFS Longitude': 'longitude'},
                                   inplace=True)

                stations_df.drop_duplicates(inplace=True)
                stations_df.dropna(inplace=True)

                # add fuzzy station name from turnstile data
                stations_df = add_fuzzy_station(df=stations_df)

                geometry: List[Point] = [Point(xy) for xy in zip(stations_df.longitude, stations_df.latitude)]
                stations_df.drop(['latitude', 'longitude'], axis=1, inplace=True)
                stations_geodf: GeoDataFrame = GeoDataFrame(stations_df, crs=crs, geometry=geometry)
                stations_out_path: str = '/tmp/transit-ref-out/'
                os.makedirs(stations_out_path, exist_ok=True)
                stations_filename: str = 'stations.shp'
                stations_geodf.to_file(stations_out_path+stations_filename)
                station_files: List[str] = glob.glob(stations_out_path+'*')
                os.chdir(stations_out_path)
                with ZipFile('stations.zip', 'w') as zipfile:
                    for file in station_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                #ps.copy_files(dest_bucket=REFBASE_BUCKET, source_folder=stations_out_path)
                ps.copy_file(dest_bucket=REFBASE_BUCKET, source=stations_out_path+'stations.zip', file='stations.zip')

            elif task == 'traffic':
                # load traffic links file
                links_url: str = 'http://data.beta.nyc//dataset/e8facf61-2bb1-49e0-9128-5a8797b214c8/resource/1384aa3a-b7e2-4c28-9b5e-2808a07a7193/download/linkinfo.csv'
                cols: List[int] = [0, 1]
                names: List[str] = ['linkid', 'link']
                converters: Dict[str, Callable] = {
                    'linkid': row_ops.clean_num
                }
                links_df: pd.DataFrame = pd.read_csv(links_url,
                                                     header=None,
                                                     usecols=cols,
                                                     names=names,
                                                     converters=converters,
                                                     encoding='utf-8')

                links_df.drop_duplicates(inplace=True)
                links_df.dropna(inplace=True)

                geometry = [LineString(build_coord_tuples(x)) for x in links_df.link]
                links_geodf = GeoDataFrame(links_df.drop('link', axis=1),
                                           crs=crs,
                                           geometry=geometry)

                links_out_path: str = '/tmp/traffic-ref-out/'
                os.makedirs(links_out_path, exist_ok=True)
                links_filename: str = 'traffic_links.shp'
                links_geodf.to_file(links_out_path+links_filename)
                links_files: List[str] = glob.glob(links_out_path+'*')
                os.chdir(links_out_path)
                with ZipFile('traffic_links.zip', 'w') as zipfile:
                    for file in links_files:
                        zipfile.write(file.rsplit('/', 1)[1])
                ps.copy_file(dest_bucket=REFBASE_BUCKET, source=links_out_path+'traffic_links.zip', file='traffic_links.zip')

            elif task == 'gas':
                # load gas data file
                filename: str = 'gas.csv'
                cols = [0, 1]
                names = ['date', 'price']
                converters = {
                    'price': row_ops.clean_num
                }
                gas_df: pd.DataFrame = pd.read_csv(ps.get_file_stream(bucket=OTHERS_BUCKET, filename=filename),
                                                   header=None,
                                                   usecols=cols,
                                                   parse_dates=['date'],
                                                   skiprows=2,
                                                   names=names,
                                                   converters=converters,
                                                   encoding='utf-8')

                file_io.write_csv(df=gas_df, bucket=REFBASE_BUCKET, filename=filename)

            elif task == 'weather':
                # load weather data file
                filename = 'weather.csv'
                cols = [5, 8, 9, 12, 13]
                names = ['date', 'prcp', 'snow', 'tmax', 'tmin']
                converters = {
                    'prcp': row_ops.clean_num,
                    'snow': row_ops.clean_num,
                    'tmax': row_ops.clean_num,
                    'tmin': row_ops.clean_num
                }
                weather_df: pd.DataFrame = pd.read_csv(ps.get_file_stream(bucket=OTHERS_BUCKET, filename=filename),
                                                       header=None,
                                                       usecols=cols,
                                                       parse_dates=['date'],
                                                       skiprows=1,
                                                       names=names,
                                                       converters=converters,
                                                       encoding='utf-8')
                weather_df['temp'] = (weather_df['tmax']+weather_df['tmin'])/2
                weather_df = weather_df.drop(columns=['tmax', 'tmin'])
                file_io.write_csv(df=weather_df, bucket=REFBASE_BUCKET, filename=filename)

        else:
            print('unrecognized ref-base load task %s' % task)
            raise errors.TaskTypeError('ref-base load '+task)
    return True
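
Usage sketch, assuming the storage helpers (ps, file_io) and the buckets are wired up. load_ref_files takes a single iterable of task names (unpacked via list(*args)); each recognized task rebuilds one reference artifact in REFBASE_BUCKET.

if __name__ == '__main__':
    load_ref_files(['cabs', 'transit', 'traffic', 'gas', 'weather'])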
Example #7
def plot_for_station(task: str,
                     freq: str,
                     filterby: str,
                     filterval: str,
                     station: str,
                     sub_task: str,
                     geomerged_cabs_df: Optional[DataFrame] = None,
                     geomerged_traffic_df: Optional[DataFrame] = None,
                     gas_df: Optional[DataFrame] = None,
                     weather_df: Optional[DataFrame] = None) -> None:
    import matplotlib.pyplot as plt
    try:
        #freq: str = task_map.task_type_map[task]['freq']
        # renamed from 'range' to avoid shadowing the builtin
        date_range: List[str] = task_map.task_type_map[task]['range']
        start_date: str = date_range[0]
        end_date: str = date_range[1]
        # determine filename of transit data for
        # the current station in the rg-transit bucket
        # replace '/' in station with ' '
        file_path: str = freq + '/' + filterby + filterval + '/'
        ts_filename: str = file_path + station.replace('/', ' ').upper()

        # read transit data for station (rg-transit bucket)
        filestream = ps.get_file_stream(bucket=RGTRANSIT_BUCKET,
                                        filename=ts_filename)
        ts_datecols = ['datetime']
        dtypes = {'delex': 'int64', 'delent': 'int64'}
        transit_df = read_csv(filestream,
                              usecols=ts_datecols + list(dtypes.keys()),
                              parse_dates=ts_datecols,
                              date_parser=row_operations.parse_rg_dt,
                              encoding='utf-8',
                              dtype=dtypes)
        transit_df = transit_df.set_index('datetime').resample(
            freq).sum().loc[start_date:end_date]
        #print(transit_df.head())

        # read fares data for station complex using station_complex_map dictionary
        fares_file_path: str = freq + '/'
        fares_filename: str = fares_file_path + station_complex_map[
            station.upper()]
        filestream = ps.get_file_stream(bucket=RGFARES_BUCKET,
                                        filename=fares_filename)
        fares_datecols = ['date']
        fares_dtypes = {
            'FF': 'int64',
            'SEN/DIS': 'int64',
            #'7-D AFAS UNL': 'int64',
            #'30-D AFAS/RMF UNL': 'int64',
            #'JOINT RR TKT': 'int64',
            '7-D UNL': 'int64',
            '30-D UNL': 'int64',
            #'14-D RFM UNL': 'int64',
            #'1-D UNL': 'int64',
            #'14-D UNL': 'int64',
            #'7D-XBUS PASS': '******',
            #'TCMC': 'int64',
            #'RF 2 TRIP': 'int64',
            #'RR UNL NO TRADE': 'int64',
            #'TCMC ANNUAL MC': 'int64',
            #'MR EZPAY EXP': 'int64',
            #'MR EZPAY UNL': 'int64',
            #'PATH 2-T': 'int64',
            #'AIRTRAIN FF': 'int64',
            #'AIRTRAIN 30-D': 'int64',
            #'AIRTRAIN 10-T': 'int64',
            #'AIRTRAIN MTHLY': 'int64',
            'STUDENTS': 'int64'
        }
        fares_df = read_csv(filestream,
                            usecols=fares_datecols + list(fares_dtypes.keys()),
                            parse_dates=fares_datecols,
                            date_parser=row_operations.parse_rg_dt,
                            encoding='utf-8',
                            dtype=fares_dtypes)
        # fares data covers one week starting Saturday and is dated as of the
        # starting Saturday. Shift dates to the end of the week (1W-resampled
        # data is indexed by end of week, i.e. Sunday) used for the rest of
        # the data: one day + one week moves to the end of the period.
        td: Timedelta = Timedelta(8, unit='d')
        fares_df[fares_datecols[0]] = fares_df[fares_datecols[0]] + td
        fares_df = melt(fares_df,
                        id_vars=fares_datecols,
                        var_name='fare_type',
                        value_name='total_users')
        fares_df = fares_df.groupby(fares_datecols + ['fare_type']).sum()
        fares_df = fares_df.reset_index().set_index(
            fares_datecols).sort_index().loc[start_date:end_date]

        # create plots
        tmp_filepath: str = '/tmp/'
        sns.set(font_scale=.65, rc={'lines.linewidth': 1})
        sns.set_style('dark')
        plt.close('all')
        fig, axes = plt.subplots(nrows=2, ncols=2, clear=True, figsize=(18, 6))
        plt.subplots_adjust(wspace=.4, hspace=.2)
        ts_col1 = 'delex'
        ts_col2 = 'delent'
        ts_label = 'transit '

        if sub_task in ['gcabs', 'ycabs']:
            # read data from other in buckets
            cabs_datecols = ['dodatetime']

            # determine relevant cabs files
            # by finding dolocationids corresponding
            # to current station from ref-base geomerged df
            dolocationids = geomerged_cabs_df.loc[
                geomerged_cabs_df.tsstation == station][['locationid', 'weight']]

            cabs_dtypes = {
                'dolocationid': 'int64',
                'passengers': 'int64',
                'distance': 'float64'
            }

            if sub_task == 'gcabs':
                gcabs_df: DataFrame
                gcabs_df = concat([
                    read_csv(ps.get_file_stream(
                        bucket=RGGCABS_BUCKET,
                        filename=file_path + str(locationid)),
                             header=0,
                             usecols=cabs_datecols + list(cabs_dtypes.keys()),
                             parse_dates=cabs_datecols,
                             encoding='utf-8',
                             dtype=cabs_dtypes)
                    for locationid in dolocationids['locationid']
                    if str(locationid) in ps.get_all_filenames(
                        bucket=RGGCABS_BUCKET, path=file_path)
                ],
                                  ignore_index=True)
                gcabs_df = gcabs_df.merge(dolocationids, left_on='dolocationid', right_on='locationid', how='left', copy=False).\
                    drop(columns=['dolocationid', 'locationid']).drop_duplicates()

                # keep 'weight' as a column so agg can carry it through;
                # set_index(cabs_datecols, 'weight') passed 'weight' as the
                # drop flag, which was not the intent
                gcabs_df = gcabs_df.set_index(cabs_datecols).groupby(
                    Grouper(freq=freq, level=0)).agg({
                        'passengers': 'sum',
                        'distance': 'sum',
                        'weight': 'first'
                    }).loc[start_date:end_date]
                #print(gcabs_df.head())

                # plots for cabs
                if dolocationids.size > 0 and gcabs_df.size > 0:
                    gcabs_label = 'green cabs '
                    gcabs_col = 'passengers'
                    create_plot(df1=transit_df,
                                varcol1=ts_col1,
                                label1=ts_label + 'exits',
                                df2=gcabs_df,
                                varcol2=gcabs_col,
                                label2=gcabs_label + gcabs_col,
                                ax=axes[0, 0],
                                weighted=True,
                                weight_col='weight',
                                station=station,
                                weekday=int(filterval))

                    create_plot(df1=transit_df,
                                varcol1=ts_col2,
                                label1=ts_label + 'entries',
                                df2=gcabs_df,
                                varcol2=gcabs_col,
                                label2=gcabs_label + gcabs_col,
                                ax=axes[0, 1],
                                weighted=True,
                                weight_col='weight',
                                station=station,
                                weekday=int(filterval))

                    df = transit_df.join(gcabs_df, how='outer') \
                        [[ts_col1, ts_col2, gcabs_col, 'weight']]
                    create_reg_plot(df=df,
                                    varcol1=ts_col1,
                                    label1=ts_label + 'exits',
                                    varcol2=gcabs_col,
                                    label2=gcabs_label + gcabs_col,
                                    ax=axes[1, 0],
                                    weighted=True,
                                    weight_col='weight')
                    create_reg_plot(df=df,
                                    varcol1=ts_col2,
                                    label1=ts_label + 'entries',
                                    varcol2=gcabs_col,
                                    label2=gcabs_label + gcabs_col,
                                    ax=axes[1, 1],
                                    weighted=True,
                                    weight_col='weight')

            elif sub_task == 'ycabs':
                ycabs_df: DataFrame
                ycabs_df = concat([
                    read_csv(ps.get_file_stream(
                        bucket=RGYCABS_BUCKET,
                        filename=file_path + str(locationid)),
                             header=0,
                             usecols=cabs_datecols + list(cabs_dtypes.keys()),
                             parse_dates=cabs_datecols,
                             encoding='utf-8',
                             dtype=cabs_dtypes)
                    for locationid in dolocationids['locationid']
                    if str(locationid) in ps.get_all_filenames(
                        bucket=RGYCABS_BUCKET, path=file_path)
                ],
                                  ignore_index=True)
                ycabs_df = ycabs_df.merge(dolocationids, left_on='dolocationid', right_on='locationid', how='left',
                                          copy=False). \
                    drop(columns=['dolocationid', 'locationid']).drop_duplicates()
                # keep 'weight' as a column so agg can carry it through
                ycabs_df = ycabs_df.set_index(cabs_datecols).groupby(
                    Grouper(freq=freq, level=0)).agg({
                        'passengers': 'sum',
                        'distance': 'sum',
                        'weight': 'first'
                    }).loc[start_date:end_date]

                #print(ycabs_df.head())

                # plots for cabs
                if dolocationids.size > 0 and ycabs_df.size > 0:
                    ycabs_label = 'yellow cabs '
                    ycabs_col = 'passengers'
                    create_plot(df1=transit_df,
                                varcol1=ts_col1,
                                label1=ts_label + 'exits',
                                df2=ycabs_df,
                                varcol2=ycabs_col,
                                label2=ycabs_label + ycabs_col,
                                ax=axes[0, 0],
                                weighted=True,
                                weight_col='weight',
                                station=station,
                                weekday=int(filterval))

                    create_plot(df1=transit_df,
                                varcol1=ts_col2,
                                label1=ts_label + 'entries',
                                df2=ycabs_df,
                                varcol2=ycabs_col,
                                label2=ycabs_label + ycabs_col,
                                ax=axes[0, 1],
                                weighted=True,
                                weight_col='weight',
                                station=station,
                                weekday=int(filterval))

                    df = transit_df.join(ycabs_df, how='outer') \
                        [[ts_col1, ts_col2, ycabs_col, 'weight']]
                    create_reg_plot(df=df,
                                    varcol1=ts_col1,
                                    label1=ts_label + 'exits',
                                    varcol2=ycabs_col,
                                    label2=ycabs_label + ycabs_col,
                                    ax=axes[1, 0],
                                    weighted=True,
                                    weight_col='weight')
                    create_reg_plot(df=df,
                                    varcol1=ts_col2,
                                    label1=ts_label + 'entries',
                                    varcol2=ycabs_col,
                                    label2=ycabs_label + ycabs_col,
                                    ax=axes[1, 1],
                                    weighted=True,
                                    weight_col='weight')

        elif sub_task == 'traffic':
            # determine relevant traffic files
            # by finding linkids corresponding
            # to current station from ref-base geomerged traffic df
            traffic_df: DataFrame
            traffic_datecols = ['datetime']
            linkids = geomerged_traffic_df.loc[
                geomerged_traffic_df.tsstation == station][['linkid', 'weight']]

            if linkids.size > 0:
                traffic_dtypes = {
                    'linkid': 'int64',
                    'speed': 'float64',
                    'traveltime': 'float64'
                }
                traffic_cols = list(traffic_dtypes.keys())
                traffic_df = concat([
                    read_csv(ps.get_file_stream(
                        bucket=RGTRAFFIC_BUCKET,
                        filename=file_path + str(int(linkid))),
                             header=0,
                             usecols=traffic_datecols + traffic_cols,
                             parse_dates=traffic_datecols,
                             encoding='utf-8',
                             dtype=traffic_dtypes)
                    for linkid in linkids['linkid']
                    if str(int(linkid)) in ps.get_all_filenames(
                        bucket=RGTRAFFIC_BUCKET, path=file_path)
                ],
                                    ignore_index=True)
                traffic_df = traffic_df.merge(
                    linkids, on='linkid', how='left',
                    copy=False).drop(columns=['linkid']).drop_duplicates()
                # keep 'weight' as a column so agg can carry it through
                traffic_df = traffic_df.set_index(traffic_datecols).groupby(
                    Grouper(freq=freq, level=0)).agg({
                        'speed': 'mean',
                        'traveltime': 'mean',
                        'weight': 'first'
                    }).loc[start_date:end_date]
                #print(traffic_df.head())
                # drop outliers
                #traffic_df = row_operations.drop_outliers(traffic_df, 'speed')

            # mirror the cab branches: plot only when traffic data matched
            if linkids.size > 0 and traffic_df.size > 0:
                tr_label = 'traffic '
                tr_col = 'speed'
                create_plot(df1=transit_df,
                            varcol1=ts_col1,
                            label1=ts_label + 'exits',
                            df2=traffic_df,
                            varcol2=tr_col,
                            label2=tr_label + tr_col,
                            ax=axes[0, 0],
                            weighted=True,
                            weight_col='weight',
                            station=station,
                            weekday=int(filterval))

                create_plot(df1=transit_df,
                            varcol1=ts_col2,
                            label1=ts_label + 'entries',
                            df2=traffic_df,
                            varcol2=tr_col,
                            label2=tr_label + tr_col,
                            ax=axes[0, 1],
                            weighted=True,
                            weight_col='weight',
                            station=station,
                            weekday=int(filterval))

                df = transit_df.join(traffic_df, how='outer') \
                    [[ts_col1, ts_col2, tr_col, 'weight']]
                create_reg_plot(df=df,
                                varcol1=ts_col1,
                                label1=ts_label + 'exits',
                                varcol2=tr_col,
                                label2=tr_label + tr_col,
                                ax=axes[1, 0],
                                weighted=True,
                                weight_col='weight')
                create_reg_plot(df=df,
                                varcol1=ts_col2,
                                label1=ts_label + 'entries',
                                varcol2=tr_col,
                                label2=tr_label + tr_col,
                                ax=axes[1, 1],
                                weighted=True,
                                weight_col='weight')

        elif sub_task == 'fares':
            tsf_label = 'transit fares'
            tsf_col = 'total_users'
            create_plot(df1=transit_df,
                        varcol1=ts_col1,
                        label1=ts_label + 'exits',
                        df2=fares_df,
                        varcol2=tsf_col,
                        label2=tsf_label,
                        ax=axes[0, 0],
                        multiplot=True,
                        multicol='fare_type',
                        station=station,
                        weekday=int(filterval))

            create_plot(df1=transit_df,
                        varcol1=ts_col2,
                        label1=ts_label + 'entries',
                        df2=fares_df,
                        varcol2=tsf_col,
                        label2=tsf_label,
                        ax=axes[0, 1],
                        multiplot=True,
                        multicol='fare_type',
                        station=station,
                        weekday=int(filterval))

            df = transit_df.join(fares_df, how='outer') \
                [[ts_col1, ts_col2, tsf_col, 'fare_type']]
            #df = df.groupby(Grouper(freq=freq, level=0), 'fare_type').sum()
            create_reg_plot(df=df,
                            varcol1=ts_col1,
                            label1=ts_label + 'exits',
                            varcol2=tsf_col,
                            label2=tsf_label,
                            ax=axes[1, 0],
                            multiplot=True,
                            multicol='fare_type')
            create_reg_plot(df=df,
                            varcol1=ts_col2,
                            label1=ts_label + 'entries',
                            varcol2=tsf_col,
                            label2=tsf_label,
                            ax=axes[1, 1],
                            multiplot=True,
                            multicol='fare_type')

        elif sub_task == 'gas':

            # gas
            gas_label = 'gas '
            gas_col = 'price'
            create_plot(df1=transit_df,
                        varcol1=ts_col1,
                        label1=ts_label + 'exits',
                        df2=gas_df,
                        varcol2=gas_col,
                        label2=gas_label + gas_col,
                        ax=axes[0, 0],
                        station=station,
                        weekday=int(filterval))

            create_plot(df1=transit_df,
                        varcol1=ts_col2,
                        label1=ts_label + 'entries',
                        df2=gas_df,
                        varcol2=gas_col,
                        label2=gas_label + gas_col,
                        ax=axes[0, 1],
                        station=station,
                        weekday=int(filterval))

            df = transit_df.join(gas_df, how='outer') \
                [[ts_col1, ts_col2, gas_col]].groupby(Grouper(freq=freq, level=0)).agg({ts_col1: 'sum',
                                                                                        ts_col2: 'sum',
                                                                                         gas_col: 'sum'})
            # drop outliers
            #df = row_operations.drop_outliers(df, 'price')
            create_reg_plot(df=df,
                            varcol1=ts_col1,
                            label1=ts_label + 'exits',
                            varcol2=gas_col,
                            label2=gas_label + gas_col,
                            ax=axes[1, 0])
            create_reg_plot(df=df,
                            varcol1=ts_col2,
                            label1=ts_label + 'entries',
                            varcol2=gas_col,
                            label2=gas_label + gas_col,
                            ax=axes[1, 1])

        elif sub_task == 'weather':
            # weather
            wr_label = 'weather '
            wr_col = 'temp'
            create_plot(df1=transit_df,
                        varcol1=ts_col1,
                        label1=ts_label + 'exits',
                        df2=weather_df,
                        varcol2=wr_col,
                        label2=wr_label + wr_col,
                        ax=axes[0, 0],
                        station=station,
                        weekday=int(filterval))

            create_plot(df1=transit_df,
                        varcol1=ts_col2,
                        label1=ts_label + 'entries',
                        df2=weather_df,
                        varcol2=wr_col,
                        label2=wr_label + wr_col,
                        ax=axes[0, 1],
                        station=station,
                        weekday=int(filterval))

            df = transit_df.join(weather_df, how='outer') \
                [[ts_col1, ts_col2, wr_col]]
            create_reg_plot(df=df,
                            varcol1=ts_col1,
                            label1=ts_label + 'exits',
                            varcol2=wr_col,
                            label2=wr_label + wr_col,
                            ax=axes[1, 0])
            create_reg_plot(df=df,
                            varcol1=ts_col2,
                            label1=ts_label + 'entries',
                            varcol2=wr_col,
                            label2=wr_label + wr_col,
                            ax=axes[1, 1])

        else:
            raise errors.TaskTypeError(sub_task)

        fig.tight_layout()
        # save plots in out bucket
        filename = sub_task + EXT
        local_filename = station + '_' + filename
        remote_filename = station + '/' + filename
        local_file = tmp_filepath + local_filename
        fig.savefig(local_file)
        # file_path already ends with '/', so no extra separator is needed
        ps.copy_file(dest_bucket=PLOTS_BUCKET,
                     file=file_path + remote_filename,
                     source=local_file)
        print('saved pdf - %(task)s %(station)s' % {
            'task': task,
            'station': station
        })

    except Exception:
        print(
            'Error in plotting task %(task)s sub-task %(sub_task)s for station %(station)s'
            % {
                'task': task,
                'sub_task': sub_task,
                'station': station
            })
        # re-raise with the original traceback intact
        raise

    return
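
Usage sketch with hypothetical argument values: plot one station's weekday-filtered weekly transit series against green-cab ridership. task must be a key in task_map.task_type_map, and geomerged_cabs_df (assumed loaded elsewhere) needs the 'tsstation', 'locationid' and 'weight' columns accessed above.

plot_for_station(task='plot-weekly',
                 freq='1W',
                 filterby='weekday',
                 filterval='1',
                 station='34 ST-PENN STA',
                 sub_task='gcabs',
                 geomerged_cabs_df=geomerged_cabs_df)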