Example #1
def perform_transit_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()
        for year in years:
            usecols = [3, 6, 7, 9, 10]
            names = ['station', 'date', 'time', 'entries', 'exits']
            url_part1: str = 's3://' + in_bucket + '/turnstile_'
            url_part2: str = ".txt"
            # urls for all saturdays in month range for year
            urls: List[str] = [
                url_part1 + year[2:] + prefix_zero(month) +
                prefix_zero(day_tuple[0]) + url_part2
                for month in range(month_st, month_end)
                for day_tuple in calendar.itermonthdays2(int(year), month)
                if day_tuple[0] in range(1, 32) and day_tuple[1] == 5
            ]

            #for url in urls:
            #    print(url)
            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates={'datetime': ['date', 'time']},
                             date_parser=row_ops.clean_transit_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'entries': row_ops.clean_num,
                                 'exits': row_ops.clean_num
                             },
                             encoding='utf-8')

            to_parquet(df=df,
                       out_bucket=out_bucket,
                       folder=year + '/',
                       compute=True)

    except Exception as err:
        raise err

    else:
        # success: release the dask client and report True
        client.close()
        status = True
        return status
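
The URL builders above and in Examples #8 and #9 rely on a prefix_zero helper that is not shown in this listing. A minimal sketch of what it presumably does, zero-padding a one-digit month or day so the file names line up, is:

def prefix_zero(num: int) -> str:
    # hypothetical sketch of the helper assumed above: left-pad a one-digit
    # month or day with a zero so it always occupies two characters
    return str(num).zfill(2)
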
Example #2
def perform_transit_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year

            df = dd.read_parquet(path=s3_in_url,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            # turnstile counters are cumulative; diff them into per-interval deltas
            df['delex'] = df['exits'].diff()
            df['delent'] = df['entries'].diff()
            df = df.drop(['exits', 'entries'], axis=1)
            df = df.dropna()

            # Tukey fences: discard rows more than 1.5 * IQR outside the quartiles
            delex_lo_q = df['delex'].quantile(.25)
            delent_lo_q = df['delent'].quantile(.25)
            delex_hi_q = df['delex'].quantile(.75)
            delent_hi_q = df['delent'].quantile(.75)
            delex_iqr = delex_hi_q - delex_lo_q
            delent_iqr = delent_hi_q - delent_lo_q
            discard = (df['delex'] < delex_lo_q - 1.5 * delex_iqr) | \
                      (df['delex'] > delex_hi_q + 1.5 * delex_iqr) | \
                      (df['delent'] < delent_lo_q - 1.5 * delent_iqr) | \
                      (df['delent'] > delent_hi_q + 1.5 * delent_iqr)
            df = df.loc[~discard]

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='lz4',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_transit %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
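
The inline quartile filter in Example #2 can be restated as a reusable helper. The sketch below is not part of the original code base; the function name is illustrative and it simply applies the same 1.5 * IQR rule to an arbitrary set of columns of a dask DataFrame:

from typing import List

import dask.dataframe as dd


def drop_iqr_outliers(df: dd.DataFrame, cols: List[str], k: float = 1.5) -> dd.DataFrame:
    # hypothetical helper restating the filter above: keep rows whose values
    # fall within [q1 - k * iqr, q3 + k * iqr] for every column in cols
    keep = None
    for col in cols:
        lo_q = df[col].quantile(.25)
        hi_q = df[col].quantile(.75)
        iqr = hi_q - lo_q
        in_range = (df[col] >= lo_q - k * iqr) & (df[col] <= hi_q + k * iqr)
        keep = in_range if keep is None else keep & in_range
    return df.loc[keep]

Under that assumption, the inline block in Example #2 is equivalent to df = drop_iqr_outliers(df, ['delex', 'delent']).
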
Example #3
def regroup_dask(task_type: str, years: List[str], resample_freq: str,
                 filter_key: str, filter_val: str) -> bool:
    try:
        # determine in and out buckets
        # and split_by from task type map
        in_bucket: str = task_map.task_type_map[task_type]['in']
        out_bucket: str = task_map.task_type_map[task_type]['out']
        split_by: List[str] = task_map.task_type_map[task_type]['split_by']
        date_cols: List[str] = task_map.task_type_map[task_type]['date_cols']
        dtypes: Dict = task_map.task_type_map[task_type]['dtypes']
        print('fetched in out and split_by for task_type %(task)s' %
              {'task': task_type})

        # read files from in bucket and concat into one df
        s3_options: Dict = ps.fetch_s3_options()
        client: Client = dask.create_dask_client(num_workers=8)

        # create out bucket
        ps.create_bucket(out_bucket)

        s3_in_url: str = 's3://' + in_bucket + '/'
        s3_sub_path: str = resample_freq + '/' + filter_key + filter_val + '/'
        if task_type == 'rg-tsfare':
            s3_sub_path = ''
        df = dd.concat([
            dd.read_csv(urlpath=s3_in_url + year + '/' + s3_sub_path + '*',
                        storage_options=s3_options,
                        parse_dates=date_cols,
                        dtype=dtypes) for year in years
        ])

        print('read files from in bucket and concatenated into one df')
        fillna_dict: Dict = {key: 0 for key in dtypes}
        # fillna is not in-place; keep the result
        df = df.fillna(fillna_dict)
        if task_type == 'rg-tsfare':
            s3_sub_path = resample_freq + '/'
        df.groupby(split_by).apply(partial(write_group_to_csv,
                                           split_by=split_by,
                                           out_bucket=out_bucket,
                                           out_path=s3_sub_path),
                                   meta=('int')).compute()

    except Exception as err:
        print('Error: %(error)s in regrouper for task_type %(task)s' % {
            'error': err,
            'task': task_type
        })
        raise err

    client.close()

    return True
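
regroup_dask, like the other functions in this listing, reads its configuration from task_map.task_type_map, whose contents are not shown here. Purely as an assumption inferred from the keys accessed above, one entry would need at least the following shape; the key, bucket, column, and dtype names are placeholders:

from typing import Dict

# hypothetical shape of one task_map.task_type_map entry, inferred from the
# keys regroup_dask reads; all names below are placeholders
task_type_map: Dict = {
    'some-regroup-task': {
        'in': 'example-in-bucket',
        'out': 'example-out-bucket',
        'split_by': ['station'],
        'date_cols': ['datetime'],
        'dtypes': {'delex': 'float64', 'delent': 'float64'}
    }
}
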
Example #4
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()
        usecols = [1, 2, 4, 5]
        names = ['speed', 'traveltime', 'datetime', 'linkid']

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + '*' + year + '.csv'

            df = dd.read_csv(urlpath=s3_in_url,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')

            dd.to_parquet(df=df,
                          path=s3_out_url,
                          engine='fastparquet',
                          compute=True,
                          compression='GZIP',
                          storage_options=s3_options)

    except Exception as err:
        print('error in perform_traffic %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
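
Several of the loaders here (Examples #1, #4, #8, #9) pass row_ops.clean_num as a converter for numeric columns. Its implementation is not part of this listing; a minimal sketch of the kind of cleaner such a converter usually is, offered only as an assumption, is:

def clean_num(value: str) -> int:
    # hypothetical converter: strip stray whitespace and fall back to 0 when
    # the field is empty or not a valid integer
    try:
        return int(str(value).strip())
    except (TypeError, ValueError):
        return 0
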
Example #5
def perform_cabs_dask(task_type: str, years: List[str]) -> bool:
    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    client: Client = dask.create_dask_client(num_workers=8)
    special_case: bool = False
    normal_case: bool = False
    s3_in_prefix: str = 's3://' + in_bucket + '/'
    try:
        s3_options: Dict = ps.fetch_s3_options()

        for year in years:

            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/'
            s3_in_url: str = s3_in_prefix + year
            # 2016 data straddles both layouts; later years only have the
            # 'special' layout and earlier years only the 'normal' one
            if int(year) == 2016:
                special_case = True
                normal_case = True
            elif int(year) > 2016:
                special_case = True
                normal_case = False
            else:
                special_case = False
                normal_case = True

            if special_case:
                clean_cabs_at_path(special=True,
                                   s3_in_url=s3_in_url + '/special/',
                                   s3_out_url=s3_out_url + '/special/',
                                   s3_options=s3_options)

            if normal_case:
                clean_cabs_at_path(special=False,
                                   s3_in_url=s3_in_url + '/normal/',
                                   s3_out_url=s3_out_url + '/normal/',
                                   s3_options=s3_options)

    except Exception as err:
        print('error in perform_cabs %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
Example #6
def to_parquet(df: dd.DataFrame,
               out_bucket: str,
               folder: str,
               compute: bool = True) -> bool:
    try:
        s3_out_url: str = 's3://' + out_bucket + '/' + folder
        s3_options: Dict = ps.fetch_s3_options()
        dd.to_parquet(df=df,
                      path=s3_out_url,
                      engine='fastparquet',
                      compute=compute,
                      compression='lz4',
                      storage_options=s3_options)
    except Exception as err:
        print('error while saving to parquet to path %(path)s - %(error)s' % {
            'path': out_bucket + '/' + folder,
            'error': str(err)
        })
        raise err
    else:
        return True
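
A short usage sketch for the helper above, assuming df is a dask DataFrame produced by one of the loaders in this listing; the bucket name is a placeholder:

# hypothetical call site for to_parquet; 'example-out-bucket' is a placeholder
saved: bool = to_parquet(df=df,
                         out_bucket='example-out-bucket',
                         folder='2018/',
                         compute=True)
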
Example #7
def run_pipeline(task_type: str) -> bool:
    tmap: Dict = task_type_map[task_type]
    in_bucket: str = tmap['in']
    out_bucket: str = tmap['out']
    cols: Dict[str, str] = tmap['cols']
    converters: Dict[str, Callable] = tmap['converters']
    dtypes: Dict[str, str] = tmap['dtypes']
    index_col: str = tmap['index']['col']
    index_sorted: bool = tmap['index']['sorted']
    row_op: Callable = tmap['row_op']
    diff: Dict = tmap['diff']
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']
    aggr_func: Callable = tmap['aggr_func']

    try:

        #client = Client(address='dscheduler:8786')

        s3_in_url: str = 's3://'+in_bucket+'/*.*'
        s3_options: Dict = ps.fetch_s3_options()
        #df = dd.read_table(path=s3_in_url, storage_options=s3_options)
        df = dd.read_table(urlpath='tmp/'+in_bucket+'/*.*',
                           header=0,
                           usecols=lambda x: x.upper() in list(cols.keys()),
                           skipinitialspace=True,
                           converters=converters
                           )

        # rename columns
        df = df.rename(columns=cols)
        df.compute()

        if index_sorted:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()
        else:
            df = df.map_partitions(lambda pdf: pdf.rename(columns=cols)
                                   .set_index(index_col).sort_index().reset_index()
                                   .apply(func=row_op, axis=1),
                                   meta=dtypes).compute()



        # map row-wise operations
        #df = df.map_partitions(lambda pdf: pdf.apply(func=row_op, axis=1), meta=dtypes)

        # diff
        if diff['compute']:
            df[diff['new_col']] = df[diff['col']].diff()

        # specific processing for transit
        if task_type == 'cl-transit':
            df = df.map_partitions(partial(remove_outliers, col='DELEXITS'), meta=dtypes)

        # drop na values
        df = df.dropna()

        # set index (assumes pre-sorted data)
        df = df.set_index(index_col, sorted=True)

        #df.compute()

        # filter by weekday; index_col has just become the index, and weekday
        # is a property of a datetime index, not a method
        if filter_by_key == 'weekday':
            df = df.loc[df.index.weekday == filter_by_val]

        # resample using frequency and aggregate function specified
        df = compose(df.resample(resample_freq), aggr_func)

        # save in out bucket
        s3_out_url: str = 's3://' + out_bucket
        # dd.to_parquet(df=df, path=s3_out_url, storage_options=s3_options)
        dd.to_parquet(df=df, path='tmp/'+out_bucket+'/*.*')

    except Exception as err:
        print('error in run_pipeline %s' % str(err))
        raise err

    return True
Example #8
def perform_traffic_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()
        for year in years:
            if year in ['2016', '2017']:
                month_st = 1
                month_end = 13
            elif year == '2015':
                month_st = 4
                month_end = 13
            elif year == '2018':
                month_st = 1
                month_end = 10
            usecols = [1, 2, 4, 5]
            names = ['speed', 'traveltime', 'datetime', 'linkid']
            url_part1: str = 's3://' + in_bucket + '/'
            url_part2: str = ".csv"
            # urls for each monthly traffic file in the month range for year
            urls: List[str] = [
                url_part1 + prefix_zero(month) + year + url_part2
                for month in range(month_st, month_end)
            ]

            #for url in urls:
            #    print(url)
            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=None,
                             usecols=usecols,
                             names=names,
                             parse_dates=['datetime'],
                             date_parser=row_ops.clean_traffic_date,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             converters={
                                 'speed': row_ops.clean_num,
                                 'traveltime': row_ops.clean_num,
                                 'linkid': row_ops.clean_num
                             },
                             encoding='utf-8')

            to_parquet(df=df,
                       out_bucket=out_bucket,
                       folder=year + '/',
                       compute=True)

            #dd.to_csv(df=df,
            #          filename='s3://'+out_bucket+'/'+year+'/',
            #          #name_function=lambda i: out_file_prefix + '_' + str(i),
            #          storage_options=s3_options)

    except Exception as err:
        raise err

    else:
        # success: release the dask client and report True
        client.close()
        status = True
        return status
Example #9
def perform_tsfare_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']

    status: bool = False
    try:
        client: Client = dask.create_dask_client(num_workers=8)
        s3_options: Dict = ps.fetch_s3_options()
        month_st: int = 1
        month_end: int = 13
        calendar: cal.Calendar = cal.Calendar()
        for year in years:
            usecols = [
                'date', 'STATION', 'FF', 'SEN/DIS', '7-D AFAS UNL',
                '30-D AFAS/RMF UNL', 'JOINT RR TKT', '7-D UNL', '30-D UNL',
                '14-D RFM UNL', '1-D UNL', '14-D UNL', '7D-XBUS PASS', 'TCMC',
                'RF 2 TRIP', 'RR UNL NO TRADE', 'TCMC ANNUAL MC',
                'MR EZPAY EXP', 'MR EZPAY UNL', 'PATH 2-T', 'AIRTRAIN FF',
                'AIRTRAIN 30-D', 'AIRTRAIN 10-T', 'AIRTRAIN MTHLY', 'STUDENTS'
            ]
            url_part1: str = 's3://' + in_bucket + '/fares_'
            url_part2: str = ".csv"
            # urls for all saturdays in month range for year
            urls: List[str] = [
                url_part1 + year[2:] + prefix_zero(month) +
                prefix_zero(day_tuple[0]) + url_part2
                for month in range(month_st, month_end)
                for day_tuple in calendar.itermonthdays2(int(year), month)
                if day_tuple[0] in range(1, 32) and day_tuple[1] == 5
            ]

            #for url in urls:
            #    print(url)
            df = dd.read_csv(urlpath=urls,
                             storage_options=s3_options,
                             header=0,
                             usecols=usecols,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             parse_dates=['date'],
                             converters={
                                 'STATION': str.strip,
                                 'FF': row_ops.clean_num,
                                 'SEN/DIS': row_ops.clean_num,
                                 '7-D AFAS UNL': row_ops.clean_num,
                                 '30-D AFAS/RMF UNL': row_ops.clean_num,
                                 'JOINT RR TKT': row_ops.clean_num,
                                 '7-D UNL': row_ops.clean_num,
                                 '30-D UNL': row_ops.clean_num,
                                 '14-D RFM UNL': row_ops.clean_num,
                                 '1-D UNL': row_ops.clean_num,
                                 '14-D UNL': row_ops.clean_num,
                                 '7D-XBUS PASS': row_ops.clean_num,
                                 'TCMC': row_ops.clean_num,
                                 'RF 2 TRIP': row_ops.clean_num,
                                 'RR UNL NO TRADE': row_ops.clean_num,
                                 'TCMC ANNUAL MC': row_ops.clean_num,
                                 'MR EZPAY EXP': row_ops.clean_num,
                                 'MR EZPAY UNL': row_ops.clean_num,
                                 'PATH 2-T': row_ops.clean_num,
                                 'AIRTRAIN FF': row_ops.clean_num,
                                 'AIRTRAIN 30-D': row_ops.clean_num,
                                 'AIRTRAIN 10-T': row_ops.clean_num,
                                 'AIRTRAIN MTHLY': row_ops.clean_num,
                                 'STUDENTS': row_ops.clean_num
                             },
                             encoding='utf-8')
            #to_parquet(df=df, out_bucket=out_bucket, folder=year + '/', compute=True)
            dd.to_csv(
                df=df,
                filename='s3://' + out_bucket + '/' + year + '/',
                #name_function=lambda i: out_file_prefix + '_' + str(i),
                storage_options=s3_options)

    except Exception as err:
        raise err

    else:
        # success: release the dask client and report True
        client.close()
        status = True
        return status
Example #10
def perform_dask(task_type: str, years: List[str]) -> bool:

    task_type_map: Dict = task_map.task_type_map[task_type]
    in_bucket: str = task_type_map['in']
    out_bucket: str = task_type_map['out']
    group: Dict = task_type_map['group']
    index_col: str = task_type_map['index']['col']

    aggr_func: Callable
    filter_by_key: str = resample_map['filter_by']['key']
    filter_by_val: int = resample_map['filter_by']['value']
    resample_freq: str = resample_map['freq']

    s3_options: Dict = ps.fetch_s3_options()

    client: Client = dask.create_dask_client(num_workers=8)

    try:
        for year in years:
            s3_in_url: str = 's3://' + in_bucket + '/' + year + '/'
            s3_out_url: str = 's3://' + out_bucket + '/' + year + '/' \
                              + resample_freq + '/' + filter_by_key+str(filter_by_val) + '/'
            path: str = ''
            print('s3 url %s' % s3_in_url)
            if task_type in ['rs-gcabs', 'rs-ycabs']:
                if int(year) >= 2016:
                    path = '/special/'
                elif int(year) < 2016:
                    path = '/normal/'

            #resample_at_path(s3_in_url+path,
            #                 s3_out_url,
            #                 s3_options,
            #                 group,
            #                 index_col)

            df = dd.read_parquet(path=s3_in_url + path,
                                 storage_options=s3_options,
                                 engine='fastparquet')

            if task_type in ['rs-gcabs', 'rs-ycabs'] and int(year) == 2016:
                #resample_at_path(s3_in_url + '/normal/',
                #                 s3_out_url,
                #                 s3_options,
                #                 group,
                #                 index_col,
                #                 'out2')
                df_2 = dd.read_parquet(path=s3_in_url + '/normal/',
                                       storage_options=s3_options,
                                       engine='fastparquet')
                df = dd.concat([df, df_2], axis=0)

            partitions = df.npartitions
            if partitions < 5:
                print('repartitioning to 5')
                df = df.repartition(npartitions=5)
                # persist returns a new collection; keep the handle
                df = client.persist(df)

            # filter
            if filter_by_key == 'weekday':
                df = df.loc[df[index_col].dt.weekday == filter_by_val]

            if group['compute']:
                grouper_cols = group['by_cols']
                aggr_func = group['aggr_func']
                meta_cols = group['meta']
                cols = [
                    col for col in meta_cols.keys()
                    if col not in grouper_cols + [index_col]
                ]
                meta_types = [
                    meta_cols[key] for key in meta_cols.keys()
                    if key not in grouper_cols + [index_col]
                ]
                print('meta_cols %s' % meta_cols)
                index = [index_col] + grouper_cols
                index_levels: List[List] = [[] for level in index]
                meta: pd.DataFrame = pd.DataFrame(columns=cols,
                                                  index=pd.MultiIndex(
                                                      index_levels,
                                                      index_levels,
                                                      names=index))

                # resample using frequency and aggregate function specified
                df = df.groupby([pd.Grouper(key=index_col, freq=resample_freq)] + grouper_cols)[cols]. \
                    apply(aggr_func, meta=meta).reset_index()
                # df = df.resample(resample_freq).sum()
                # print('after resampling')

            print('after grouping and resampling %s' % str(df.shape))

            # save in out bucket
            dd.to_csv(
                df=df,
                filename=s3_out_url,
                #name_function=lambda i: out_file_prefix + '_' + str(i),
                storage_options=s3_options)

            #dd.to_parquet(df=df,
            #              path=s3_out_url,
            #              engine='fastparquet',
            #compute=True,
            #write_index=True,
            #              compression='lz4',
            #              storage_options=s3_options)

    except Exception as err:
        print('error in perform_dask %s' % str(err))
        client.close()
        raise err

    client.close()

    return True
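
run_pipeline (Example #7) and perform_dask above both read a module-level resample_map that is not included in this listing. An illustrative value, inferred only from the keys they access, could look like the following; the weekly frequency and weekday 5 (Saturday) are placeholder choices:

from typing import Dict

# hypothetical resample_map, shaped after the lookups in Examples #7 and #10
resample_map: Dict = {
    'filter_by': {
        'key': 'weekday',
        'value': 5
    },
    'freq': 'W'
}
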