def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading fact_order_metrics')
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics', days=days, s3=s3)

    cost_allocator_ppc(df_fo_source=fo_bing_filter(df_fo), source_name='bing',
                       levels=['date', 'brandcode', 'countrycode', 'device'], env=env, days=days)
    cost_allocator_ppc(df_fo_source=fo_facebook_filter(df_fo), source_name='facebook',
                       levels=['date', 'brandcode'], env=env, days=days)
    cost_allocator_ppc(df_fo_source=fo_adwords_filter(df_fo), source_name='adwords',
                       levels=['date', 'adwords_campaign_id', 'device'], env=env, days=days)
    cost_allocator_ppc(df_fo_source=fo_criteo_filter(df_fo), source_name='criteo',
                       levels=['date', 'brandcode', 'countrycode'], env=env, days=days)
    cost_allocator_sovendus(df_fo_source=fo_sovendus_filter(df_fo), env=env)
    cost_allocator_awin(df_fo_source=df_fo, env=env, days=days)

    update_table_partition(env, table_name='order_cost')

def cost_allocator_ppc(df_fo_source, source_name, levels, env, days):
    """
    Allocate cost for PPC sources and upload the results to S3.

    Parameters
    ----------
    df_fo_source: DataFrame
        Slice of fact_order_metrics containing only the orders of the given data source
    source_name: string
        Name of the data source
    levels: list
        Detail levels that cost will be allocated on
    env: string
        Environment
    days: int
        Number of days we want to back fill
    """
    folder_source = f'tables/funnel/{source_name}'
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # read cost table
    logger.info(f'reading {folder_source}')
    df_source = download_and_read(prefix=folder_source, days=days, s3=s3)

    # allocate cost
    logger.info('allocating cost')
    df_order_cost = cost_allocation_logic_ppc(df_fo_source, df_source, levels, source_name)

    # upload to s3
    logger.info(f'uploading partitioned {source_name}.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order',
                              suffix=source_name)

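# NOTE: cost_allocation_logic_ppc is defined elsewhere in the repo and is not shown here.
# The sketch below is only a hypothetical illustration of one way cost could be spread
# evenly over the orders that share the same allocation levels; the real logic may differ.
def _even_allocation_sketch(df_fo_source, df_source, levels):
    import pandas as pd  # assumes pandas DataFrames and a 'cost' column in df_source

    cost = df_source.groupby(levels, as_index=False)['cost'].sum()                    # cost per level combination
    orders = df_fo_source.groupby(levels, as_index=False)['referenceorder'].count()   # orders per level combination
    merged = orders.merge(cost, how='left', on=levels).fillna({'cost': 0})
    merged['cpa'] = merged['cost'] / merged['referenceorder']                          # equal share per order
    return df_fo_source.merge(merged[levels + ['cpa']], how='left', on=levels)
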
def main(env):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)
    sql = '''
        select *
        from sandbox_ana.sovendus_commission'''

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        return  # df is undefined at this point, so skip the upload

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.upload_dataframe(df=df, delimiter=';', s3_key=S3_DESTINATION_KEY)
    logger.info('Upload succeeded')

def cost_allocator_awin(df_fo_source, env, days):
    """
    Allocate cost for Awin and upload the results to S3.
    """
    # prepare dataset
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)
    t = 'tables/funnel/awin'
    logger.info(f'reading {t}')
    df_awin = download_and_read(prefix=t, days=days, s3=s3)

    # align mixed data types within the column
    df_awin.order_reference = df_awin.order_reference.apply(str)
    # slice for actual orders
    df_cpa = df_awin.loc[df_awin.order_reference.str.isdigit()].copy()
    df_cpa.rename(columns={'transaction_commission_validation_date': 'cpa', 'date': 'awin_date'},
                  inplace=True)

    # the dwh date is sometimes 1 or 2 days later than the awin date,
    # so replace the awin date with the dwh date
    df_fo_date = df_fo_source[['date', 'referenceorder']].copy()
    df_fo_date.referenceorder = df_fo_date.referenceorder.apply(str)
    df_cpa = df_cpa.merge(df_fo_date, how='inner', left_on='order_reference', right_on='referenceorder')
    df_cpa = df_cpa[['date', 'referenceorder', 'cpa']]

    # slice for bonus rows
    df_cpa_extra = df_awin.loc[~df_awin.order_reference.str.isdigit()].copy()

    # count number of orders per day
    df_orders = df_cpa.groupby(['date'], as_index=False)[['referenceorder']].count()
    # sum bonus commission per day
    df_bonus = df_cpa_extra.groupby(['date'], as_index=False)[['transaction_commission_validation_date']].sum()

    # distribute the bonus commission evenly over all orders within the same day
    df_merged = df_orders.merge(df_bonus, how='inner')
    df_merged['cpa_extra'] = df_merged['transaction_commission_validation_date'] / df_merged['referenceorder']
    df_order_cost = df_cpa.merge(df_merged[['date', 'cpa_extra']], how='left')
    df_order_cost.fillna(0, inplace=True)
    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'awin'

    logger.info('uploading partitioned awin.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix='awin')

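# Toy illustration only (made-up numbers) of the bonus distribution above: two CPA orders
# on one day plus a single day-level bonus row of 4.0 give each order cpa_extra = 4.0 / 2 = 2.0.
def _awin_bonus_example():
    import pandas as pd

    df_cpa = pd.DataFrame({'date': ['2021-01-01', '2021-01-01'],
                           'referenceorder': ['1001', '1002'],
                           'cpa': [5.0, 7.0]})
    df_cpa_extra = pd.DataFrame({'date': ['2021-01-01'],
                                 'transaction_commission_validation_date': [4.0]})

    df_orders = df_cpa.groupby(['date'], as_index=False)[['referenceorder']].count()
    df_bonus = df_cpa_extra.groupby(['date'], as_index=False)[['transaction_commission_validation_date']].sum()
    df_merged = df_orders.merge(df_bonus, how='inner')
    df_merged['cpa_extra'] = df_merged['transaction_commission_validation_date'] / df_merged['referenceorder']
    return df_cpa.merge(df_merged[['date', 'cpa_extra']], how='left')  # cpa_extra == 2.0 for both orders
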
def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)
    sql = read_sql(filename='fact_ua_categories.sql', days=days)

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        return  # df is undefined at this point, so skip the upload

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.partition_table_upload(df=df, prefix=S3_DESTINATION_KEY)
    logger.info('Upload succeeded, adding table partition')
    update_table_partition(env, table_name='fact_ua_categories')

def cost_allocator_sovendus(df_fo_source, env):
    """
    Allocate cost for Sovendus and upload the results to S3.

    Parameters
    ----------
    df_fo_source: DataFrame
        Slice of fact_order_metrics containing only the Sovendus orders
    env: string
        Environment
    """
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading sovendus-commission')
    df_svd = s3.read_csv_from_s3(s3_key='tables/redshift/sovendus-commission/sovendus-commission.csv')

    # sovendus orders (already filtered by the caller)
    df_fo_svd = df_fo_source

    # if last_date is null, fill it with the current date
    today = datetime.strftime(datetime.today(), "%Y-%m-%d")
    df_svd.last_date.fillna(today, inplace=True)

    # join the commission table onto the order table on brand/country level and
    # keep only the rows where the order date falls into the effective period
    logger.info('allocating cost')
    df_order_cost = df_fo_svd.merge(df_svd, on=['brandcode', 'countrycode'], how='left')
    df_order_cost = df_order_cost.loc[(df_order_cost.date >= df_order_cost.first_date)
                                      & (df_order_cost.date <= df_order_cost.last_date)].copy()
    df_order_cost['cpa'] = df_order_cost.gsii * df_order_cost.commission_rate
    df_order_cost['cpa_extra'] = 0
    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'sovendus'

    logger.info('uploading partitioned sovendus.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix='sovendus')

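# Toy illustration only (made-up numbers) of the Sovendus allocation above: an order dated
# inside the commission's effective period gets cpa = gsii * commission_rate; orders outside
# the period are dropped by the date filter. Column names mirror the function above.
def _sovendus_allocation_example():
    import pandas as pd

    df_fo_svd = pd.DataFrame({'date': ['2021-01-10', '2021-03-01'],
                              'referenceorder': ['1001', '1002'],
                              'brandcode': ['AA', 'AA'],
                              'countrycode': ['de', 'de'],
                              'gsii': [100.0, 80.0]})
    df_svd = pd.DataFrame({'brandcode': ['AA'], 'countrycode': ['de'],
                           'commission_rate': [0.05],
                           'first_date': ['2021-01-01'], 'last_date': ['2021-01-31']})

    df = df_fo_svd.merge(df_svd, on=['brandcode', 'countrycode'], how='left')
    df = df.loc[(df.date >= df.first_date) & (df.date <= df.last_date)].copy()
    df['cpa'] = df.gsii * df.commission_rate  # only order 1001 survives, cpa = 100.0 * 0.05 = 5.0
    return df
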
def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # download/read files
    df_adwords_campaign = s3.read_csv_from_s3(
        s3_key='tables/funnel/dim-adwords-campaign/dim-adwords-campaign.csv')

    fo_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium', 'channel_grouping',
        'campaign', 'referenceorder', 'adwords_campaign_id', 'is_ga_tracked', 'gsiibc'
    ]
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics', days=days,
                              cols=fo_cols, s3=s3)

    adwords_cols = ['date', 'adwords_campaign_id', 'impressions', 'clicks', 'cost']
    df_adwords = download_and_read(prefix='tables/funnel/adwords', days=days,
                                   cols=adwords_cols, s3=s3)

    awin_cols = ['date', 'referenceorder', 'cpa_total']
    df_awin = download_and_read(prefix='tables/cost-allocation/order', days=days,
                                cols=awin_cols, s3=s3, suffix='awin')

    bing_cols = ['date', 'brandcode', 'countrycode', 'campaign', 'impressions', 'clicks', 'cost']
    df_bing = download_and_read(prefix='tables/funnel/bing', days=days, cols=bing_cols, s3=s3)

    criteo_cols = ['date', 'brandcode', 'countrycode', 'impressions', 'clicks', 'cost']
    df_criteo = download_and_read(prefix='tables/funnel/criteo', days=days, cols=criteo_cols, s3=s3)

    facebook_cols = ['date', 'referenceorder', 'cpa_total']
    df_facebook = download_and_read(prefix='tables/cost-allocation/order', days=days,
                                    cols=facebook_cols, s3=s3, suffix='facebook')

    gsc_cols = ['date', 'brandcode', 'countrycode', 'impressions', 'clicks']
    df_gsc = download_and_read(prefix='tables/funnel/gsc', days=days, cols=gsc_cols, s3=s3)

    ua_cat_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium', 'channel_grouping',
        'campaign', 'adwords_campaign_id', 'sessions'
    ]
    df_ua_cat = download_and_read(prefix='tables/redshift/fact-ua-categories', days=days,
                                  cols=ua_cat_cols, s3=s3)

    # process data
    df_fo = df_fo.merge(df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']], how='left')
    df_fo = df_fo.merge(df_awin[['referenceorder', 'cpa_total']], how='left')
    df_fo = process_dwh(df_fo, is_fo=True)

    df_ua_cat = df_ua_cat.merge(df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']], how='left')
    df_ua_cat = process_dwh(df_ua_cat, is_fo=False)

    df_adwords = df_adwords.merge(df_adwords_campaign, how='left')
    df_adwords_g = process_adwords(df_adwords)

    df_awin = df_awin.merge(
        df_fo[['brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel']], how='left')
    df_awin_g = process_awin(df_awin)

    df_bing_g = process_bing(df_bing)
    df_criteo_g = process_criteo(df_criteo)

    df_facebook = df_facebook.merge(
        df_fo[['brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel']], how='left')
    df_facebook_g = process_facebook(df_facebook)

    df_gsc_g = process_gsc(df_gsc)

    df_source_g = pd.concat([df_adwords_g, df_awin_g, df_bing_g, df_criteo_g, df_facebook_g, df_gsc_g])

    df_fo_g = df_fo.groupby(['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
                            as_index=False).agg({'referenceorder': 'count', 'gsiibc': 'sum'})
    df_ua_cat_g = df_ua_cat.groupby(['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
                                    as_index=False)[['sessions']].sum()

    df_final = df_source_g.merge(df_fo_g, how='outer').merge(df_ua_cat_g, how='outer')

    s3.partition_table_upload(df=df_final, prefix='tables/reports/channel-report')

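# Toy illustration only (made-up rows) of why the final merges above use how='outer': a
# source/channel that has cost but no orders and one that has orders but no cost both
# survive in the combined report.
def _outer_merge_example():
    import pandas as pd

    df_source_g = pd.DataFrame({'date': ['2021-01-01'], 'brandcode': ['AA'], 'countrycode': ['de'],
                                'data_source': ['bing'], 'channel': ['ppc'], 'cost': [10.0]})
    df_fo_g = pd.DataFrame({'date': ['2021-01-01'], 'brandcode': ['AA'], 'countrycode': ['de'],
                            'data_source': ['sovendus'], 'channel': ['affiliate'],
                            'referenceorder': [3], 'gsiibc': [120.0]})
    # two rows: bing with NaN order metrics, sovendus with NaN cost
    return df_source_g.merge(df_fo_g, how='outer')
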
def funnel_wrapper(env, days, channel, func):
    """
    Wrap the read and write steps shared by all funnel data sources.

    Parameters
    ----------
    env: string
        Environment
    days: int
        Number of days we want to back fill
    channel: string
        Name of the data source
    func: function
        Function that processes the data of the given data source
    """
    s3 = s3Toolkit(get_bucket_name(env))
    date_range, keys = generate_days_keys(channel=channel, days=days)

    for key in keys:
        t0 = time.time()
        filename = key.split('/')[-1]
        path = os.path.join(DATA_DIR, filename)

        logger.info(f'Processing {key}')
        s3.download_file(s3_key=key, filename=path)

        try:
            rules = lambda x: x not in ['Connection_type_code', 'Connection_id', 'Currency']
            df_source = pd.read_csv(path, sep=',', usecols=rules)
        except Exception as e:
            logger.warning(f'Failed to read {filename}. Exception: {e}')
            df_source = pd.DataFrame()

        t1 = time.time()
        t_read = round(t1 - t0, 1)

        if len(df_source) == 0:
            logger.warning('Empty DataFrame!')
            continue

        logger.debug('Processing DataFrame')
        df_final = df_source.loc[df_source['Date'].between(date_range[0], date_range[1])].copy()
        df_final = func(df_final)
        t2 = time.time()
        t_processing = round(t2 - t1, 1)

        logger.debug('Uploading to S3')
        s3.partition_table_upload(df_final, prefix=f'tables/funnel/{channel}')
        t3 = time.time()
        t_upload = round(t3 - t2, 1)

        logger.info(f'Done. Download: {t_read}s, data process: {t_processing}s, '
                    f'upload: {t_upload}s, total: {round(t3 - t0, 1)}s')

        try:
            logger.debug('Removing file')
            os.remove(path)
        except Exception as e:
            logger.warning(f'Failed to remove {filename}. Exception: {e}')

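# Hypothetical usage sketch of funnel_wrapper: the channel name and the processing function
# below are placeholders; the real per-channel processors live elsewhere in the repo and
# their exact names and signatures may differ.
def _clean_example_channel(df):
    # minimal placeholder processor: normalise column names before upload
    return df.rename(columns=str.lower)

# funnel_wrapper(env='dev', days=3, channel='example-channel', func=_clean_example_channel)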