Example #1
def main(env, days):

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading fact_order_metrics')
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics', days=days, s3=s3)

    cost_allocator_ppc(df_fo_source=fo_bing_filter(df_fo),
                       source_name='bing',
                       levels=['date', 'brandcode', 'countrycode', 'device'],
                       env=env, days=days)

    cost_allocator_ppc(df_fo_source=fo_facebook_filter(df_fo),
                       source_name='facebook',
                       levels=['date', 'brandcode'],
                       env=env, days=days)

    cost_allocator_ppc(df_fo_source=fo_adwords_filter(df_fo),
                       source_name='adwords',
                       levels=['date', 'adwords_campaign_id', 'device'],
                       env=env, days=days)

    cost_allocator_ppc(df_fo_source=fo_criteo_filter(df_fo),
                       source_name='criteo',
                       levels=['date', 'brandcode', 'countrycode'],
                       env=env, days=days)

    cost_allocator_sovendus(df_fo_source=fo_sovendus_filter(df_fo),
                            env=env)

    cost_allocator_awin(df_fo_source=df_fo, env=env, days=days)

    update_table_partition(env, table_name='order_cost')
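
The helper functions used throughout these examples (get_bucket_name, s3Toolkit, download_and_read, update_table_partition) are defined elsewhere in the project and are not shown on this page. Purely for orientation, here is a minimal sketch of what download_and_read might look like, assuming day-partitioned CSVs under the given prefix; the key layout and the fallback behaviour are assumptions, not the project's actual implementation.

from datetime import datetime, timedelta

import pandas as pd


def download_and_read(prefix, days, s3, cols=None, suffix=None):
    """Hypothetical sketch: read the last `days` day-partitions under `prefix` into one DataFrame."""
    name = suffix if suffix else prefix.split('/')[-1]
    frames = []
    for offset in range(days):
        date = datetime.strftime(datetime.today() - timedelta(days=offset), '%Y-%m-%d')
        # assumed key layout: <prefix>/date=<YYYY-MM-DD>/<name>.csv
        s3_key = f'{prefix}/date={date}/{name}.csv'
        try:
            df = s3.read_csv_from_s3(s3_key=s3_key)
        except Exception:
            continue  # skip days that have no partition
        frames.append(df[cols] if cols else df)
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()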
Example #2
def cost_allocator_ppc(df_fo_source, source_name, levels, env, days):
    """
    Use to allocate cost for ppc sources and upload results to s3

    Parameters
    ----------
    df_fo_source: DataFrame
        DataFrame from fact_order_metrics after sliced for specific data source orders
    source_name: string
        Name of the data source
    levels: list
        detail level that cost will be allocated on


    """
    folder_source = f'tables/funnel/{source_name}'

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # reading cost table
    logger.info(f'reading {folder_source}')
    df_source = download_and_read(prefix=folder_source, days=days, s3=s3)

    # allocating cost
    logger.info('allocating cost')
    df_order_cost = cost_allocation_logic_ppc(df_fo_source, df_source, levels, source_name)

    # uploading to s3
    logger.info(f'uploading partitioned {source_name}.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix=source_name)
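
The actual allocation happens in cost_allocation_logic_ppc, which is not shown in these examples. Purely as an illustration of what the levels parameter controls, the sketch below splits each group's cost evenly across the orders that share the same values on levels and returns the column layout the other allocators produce (date, referenceorder, cpa, cpa_extra, cpa_total, data_source); the real logic is an assumption here and may weight orders differently.

import pandas as pd


def cost_allocation_logic_ppc(df_fo_source, df_source, levels, source_name):
    """Illustrative sketch only: split each group's cost evenly across its orders."""
    # total cost per level combination, assuming the funnel table has a 'cost' column
    df_cost = df_source.groupby(levels, as_index=False)[['cost']].sum()
    # number of orders per level combination
    df_orders = df_fo_source.groupby(levels, as_index=False)[['referenceorder']].count()
    df_orders = df_orders.rename(columns={'referenceorder': 'order_count'})

    df = df_fo_source.merge(df_cost, on=levels, how='left')
    df = df.merge(df_orders, on=levels, how='left')
    df['cpa'] = (df['cost'] / df['order_count']).fillna(0)
    df['cpa_extra'] = 0
    df['cpa_total'] = df['cpa'] + df['cpa_extra']
    df['data_source'] = source_name
    return df[['date', 'referenceorder', 'cpa', 'cpa_extra', 'cpa_total', 'data_source']]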
Example #3
def main(env):

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    sql = ''' select * from sandbox_ana.sovendus_commission'''

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        raise

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.upload_dataframe(df=df, delimiter=';', s3_key=S3_DESTINATION_KEY)
    logger.info('Upload succeeded')
Example #4
def cost_allocator_awin(df_fo_source, env, days):
    """
    Use to allocate cost for Awin  and upload results to s3

    """
    # prepare dataset
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    folder_source = 'tables/funnel/awin'
    logger.info(f'reading {folder_source}')
    df_awin = download_and_read(prefix=folder_source, days=days, s3=s3)

    # align mixed data types within the column
    df_awin.order_reference = df_awin.order_reference.astype(str)

    df_cpa = df_awin.loc[df_awin.order_reference.str.isdigit()].copy()  # slicing for actual orders
    df_cpa.rename(columns={'transaction_commission_validation_date': 'cpa', 'date': 'awin_date'}, inplace=True)

    # the DWH date is sometimes 1 or 2 days later than the Awin date; switch to the DWH date
    df_fo_date = df_fo_source[['date', 'referenceorder']].copy()
    df_fo_date.referenceorder = df_fo_date.referenceorder.astype(str)

    df_cpa = df_cpa.merge(df_fo_date, how='inner', left_on='order_reference', right_on='referenceorder')
    df_cpa = df_cpa[['date', 'referenceorder', 'cpa']]
    # df_cpa_extra = df_awin.loc[df_awin.order_reference.str.contains('bonus')].copy()  # slicing for bonus rows
    df_cpa_extra = df_awin.loc[~df_awin.order_reference.str.isdigit()].copy()  # slicing for bonus rows
    df_orders = df_cpa.groupby(['date'], as_index=False)[['referenceorder']].count()  # count number of orders per day
    df_bonus = df_cpa_extra.groupby(['date'], as_index=False)[['transaction_commission_validation_date']].sum()  # sum bonus commission

    # distributing bonus commission to all the orders within the same day
    df_merged = df_orders.merge(df_bonus, how='inner')
    df_merged['cpa_extra'] = df_merged['transaction_commission_validation_date'] / df_merged['referenceorder']

    df_order_cost = df_cpa.merge(df_merged[['date','cpa_extra']], how='left')
    df_order_cost.fillna(0, inplace=True)
    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'awin'

    logger.info('uploading partitioned awin.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix='awin')
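
To make the bonus-distribution step above concrete: if a day has 4 matched orders and 10.0 of bonus commission, each order gets cpa_extra = 2.5. A toy run of the same merge and division:

import pandas as pd

# one day with 4 orders and a single 10.0 bonus commission row
df_orders = pd.DataFrame({'date': ['2021-05-10'], 'referenceorder': [4]})
df_bonus = pd.DataFrame({'date': ['2021-05-10'],
                         'transaction_commission_validation_date': [10.0]})

df_merged = df_orders.merge(df_bonus, how='inner')
df_merged['cpa_extra'] = (df_merged['transaction_commission_validation_date']
                          / df_merged['referenceorder'])
print(df_merged[['date', 'cpa_extra']])  # cpa_extra == 2.5 for that day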
Example #5
def main(env, days):

    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    sql = read_sql(filename='fact_ua_categories.sql', days=days)

    try:
        logger.info(f'Fetching {TABLE_NAME}')
        with DatabaseRedshift() as db:
            df = db.fetch(sql)
        if df.empty:
            logger.warning('Empty DataFrame')
    except Exception as e:
        logger.error(f'Failed to fetch {TABLE_NAME}. Exception: {e}')
        raise

    logger.info(f'Uploading to {S3_DESTINATION_KEY}')
    s3.partition_table_upload(df=df, prefix=S3_DESTINATION_KEY)
    logger.info('Upload succeeded, adding table partition')

    update_table_partition(env, table_name='fact_ua_categories')
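
read_sql is another project helper that is not shown here. A minimal sketch, assuming the .sql file carries a placeholder for the number of days to back fill; the file location and the placeholder name are assumptions.

import os

SQL_DIR = 'sql'  # assumed location of the .sql files


def read_sql(filename, days):
    """Hypothetical sketch: load a SQL file and substitute the days parameter."""
    with open(os.path.join(SQL_DIR, filename)) as f:
        template = f.read()
    # assumes the query contains a {days} placeholder such as
    # "where date >= dateadd(day, -{days}, current_date)"
    return template.format(days=days)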
Example #6
def cost_allocator_sovendus(df_fo_source, env):
    """
    Use to allocate cost for Sovendus and upload results to s3

    Parameters
    ----------
    df_fo_source: DataFrame
        DataFrame from fact_order_metrics after sliced for specific data source orders


    """
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    logger.info('reading sovendus-commission')
    df_svd = s3.read_csv_from_s3(s3_key='tables/redshift/sovendus-commission/sovendus-commission.csv')

    # df_fo_source is already sliced to Sovendus orders by the caller
    df_fo_svd = df_fo_source

    # if last_date is null, fill it with the current date
    today = datetime.strftime(datetime.today(), "%Y-%m-%d")
    df_svd['last_date'] = df_svd['last_date'].fillna(today)

    # join commission table on order table on brand country level
    # keep only the rows where date falls into the effective period
    logger.info('allocating cost')
    df_order_cost = df_fo_svd.merge(df_svd, on=['brandcode', 'countrycode'], how='left')
    df_order_cost = df_order_cost.loc[(df_order_cost.date >= df_order_cost.first_date) & (df_order_cost.date <= df_order_cost.last_date)].copy()
    df_order_cost['cpa'] = df_order_cost.gsii * df_order_cost.commission_rate
    df_order_cost['cpa_extra'] = 0
    df_order_cost = df_order_cost[['date', 'referenceorder', 'cpa', 'cpa_extra']]
    df_order_cost['cpa_total'] = df_order_cost.cpa + df_order_cost.cpa_extra
    df_order_cost['data_source'] = 'sovendus'

    logger.info('uploading partitioned sovendus.csv')
    s3.partition_table_upload(df=df_order_cost, prefix='tables/cost-allocation/order', suffix='sovendus')
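
The Sovendus allocation keeps an order only when its date falls inside the effective period of the matching commission row, then prices it as gsii * commission_rate. A toy example of that filter with a single order and a single commission row (ISO date strings compare correctly as strings):

import pandas as pd

df_fo_svd = pd.DataFrame({'date': ['2021-05-10'], 'brandcode': ['BR'],
                          'countrycode': ['DE'], 'referenceorder': ['123'],
                          'gsii': [100.0]})
df_svd = pd.DataFrame({'brandcode': ['BR'], 'countrycode': ['DE'],
                       'first_date': ['2021-01-01'], 'last_date': ['2021-12-31'],
                       'commission_rate': [0.03]})

df = df_fo_svd.merge(df_svd, on=['brandcode', 'countrycode'], how='left')
df = df.loc[(df.date >= df.first_date) & (df.date <= df.last_date)].copy()
df['cpa'] = df.gsii * df.commission_rate  # 100.0 * 0.03 == 3.0
print(df[['date', 'referenceorder', 'cpa']])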
Example #7
def main(env, days):
    bucket = get_bucket_name(env)
    s3 = s3Toolkit(bucket)

    # download/read files
    df_adwords_campaign = s3.read_csv_from_s3(
        s3_key='tables/funnel/dim-adwords-campaign/dim-adwords-campaign.csv')

    fo_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium',
        'channel_grouping', 'campaign', 'referenceorder',
        'adwords_campaign_id', 'is_ga_tracked', 'gsiibc'
    ]
    df_fo = download_and_read(prefix='tables/redshift/fact-order-metrics',
                              days=days,
                              cols=fo_cols,
                              s3=s3)

    adwords_cols = [
        'date', 'adwords_campaign_id', 'impressions', 'clicks', 'cost'
    ]
    df_adwords = download_and_read(prefix='tables/funnel/adwords',
                                   days=days,
                                   cols=adwords_cols,
                                   s3=s3)

    awin_cols = ['date', 'referenceorder', 'cpa_total']
    df_awin = download_and_read(prefix='tables/cost-allocation/order',
                                days=days,
                                cols=awin_cols,
                                s3=s3,
                                suffix='awin')

    bing_cols = [
        'date', 'brandcode', 'countrycode', 'campaign', 'impressions',
        'clicks', 'cost'
    ]
    df_bing = download_and_read(prefix='tables/funnel/bing',
                                days=days,
                                cols=bing_cols,
                                s3=s3)

    criteo_cols = [
        'date', 'brandcode', 'countrycode', 'impressions', 'clicks', 'cost'
    ]
    df_criteo = download_and_read(prefix='tables/funnel/criteo',
                                  days=days,
                                  cols=criteo_cols,
                                  s3=s3)

    facebook_cols = ['date', 'referenceorder', 'cpa_total']
    df_facebook = download_and_read(prefix='tables/cost-allocation/order',
                                    days=days,
                                    cols=facebook_cols,
                                    s3=s3,
                                    suffix='facebook')

    gsc_cols = ['date', 'brandcode', 'countrycode', 'impressions', 'clicks']
    df_gsc = download_and_read(prefix='tables/funnel/gsc',
                               days=days,
                               cols=gsc_cols,
                               s3=s3)

    ua_cat_cols = [
        'date', 'brandcode', 'countrycode', 'source', 'medium',
        'channel_grouping', 'campaign', 'adwords_campaign_id', 'sessions'
    ]
    df_ua_cat = download_and_read(prefix='tables/redshift/fact-ua-categories',
                                  days=days,
                                  cols=ua_cat_cols,
                                  s3=s3)

    # process data
    df_fo = df_fo.merge(
        df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']],
        how='left')
    df_fo = df_fo.merge(df_awin[['referenceorder', 'cpa_total']], how='left')
    df_fo = process_dwh(df_fo, is_fo=True)

    df_ua_cat = df_ua_cat.merge(
        df_adwords_campaign[['adwords_campaign_id', 'cpc_channel']],
        how='left')
    df_ua_cat = process_dwh(df_ua_cat, is_fo=False)

    df_adwords = df_adwords.merge(df_adwords_campaign, how='left')
    df_adwords_g = process_adwords(df_adwords)

    df_awin = df_awin.merge(df_fo[[
        'brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel'
    ]],
                            how='left')
    df_awin_g = process_awin(df_awin)

    df_bing_g = process_bing(df_bing)

    df_criteo_g = process_criteo(df_criteo)

    df_facebook = df_facebook.merge(df_fo[[
        'brandcode', 'countrycode', 'referenceorder', 'data_source', 'channel'
    ]],
                                    how='left')
    df_facebook_g = process_facebook(df_facebook)

    df_gsc_g = process_gsc(df_gsc)

    df_source_g = pd.concat([
        df_adwords_g, df_awin_g, df_bing_g, df_criteo_g, df_facebook_g,
        df_gsc_g
    ])

    df_fo_g = df_fo.groupby(
        ['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
        as_index=False).agg({
            'referenceorder': 'count',
            'gsiibc': 'sum'
        })
    df_ua_cat_g = df_ua_cat.groupby(
        ['date', 'brandcode', 'countrycode', 'data_source', 'channel'],
        as_index=False)[['sessions']].sum()

    df_final = df_source_g.merge(df_fo_g, how='outer').merge(df_ua_cat_g,
                                                             how='outer')

    s3.partition_table_upload(df=df_final,
                              prefix='tables/reports/channel-report')
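
Several of the merges above are called without an explicit on, so pandas joins on the columns the two frames have in common; that is what lines up df_source_g, df_fo_g and df_ua_cat_g on date/brandcode/countrycode/data_source/channel in the final outer merges. A small standalone demonstration:

import pandas as pd

left = pd.DataFrame({'date': ['2021-05-10'], 'brandcode': ['BR'],
                     'countrycode': ['DE'], 'clicks': [40]})
right = pd.DataFrame({'date': ['2021-05-10'], 'brandcode': ['BR'],
                      'countrycode': ['DE'], 'sessions': [120]})

# with no `on` argument, pandas joins on the shared columns: date, brandcode, countrycode
print(left.merge(right, how='outer'))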
Example #8
def funnel_wrapper(env, days, channel, func):
    """
    A function wraps the similar read and write processes when handling funnel data.

    Parameters
    ----------
    env: string
        Environment
    days: int
        Number of days we want to back fill.
    channel: string
        Name of the data source.
    func: function
        function to process the data from different data source.

    Returns
    -------

    """

    s3 = s3Toolkit(get_bucket_name(env))
    date_range, keys = generate_days_keys(channel=channel, days=days)

    for key in keys:
        t0 = time.time()
        filename = key.split('/')[-1]
        path = os.path.join(DATA_DIR, filename)
        logger.info(f'Processing {key}')
        s3.download_file(s3_key=key, filename=path)
        try:
            # usecols accepts a callable: keep every column not in the exclusion list
            rules = lambda x: x not in [
                'Connection_type_code', 'Connection_id', 'Currency'
            ]
            df_source = pd.read_csv(path, sep=',', usecols=rules)
        except Exception as e:
            logger.warning(f'Failed to read {filename}. Exception: {e}')
            df_source = pd.DataFrame()

        t1 = time.time()
        t_read = round(t1 - t0, 1)

        if len(df_source) == 0:
            logger.warning('Empty DataFrame!')
            continue

        logger.debug('Processing DataFrame')
        df_final = df_source.loc[df_source['Date'].between(
            date_range[0], date_range[1])].copy()
        df_final = func(df_final)
        t2 = time.time()
        t_processing = round(t2 - t1, 1)

        logger.debug('Uploading to S3')
        s3.partition_table_upload(df_final, prefix=f'tables/funnel/{channel}')
        t3 = time.time()
        t_upload = round(t3 - t2, 1)

        logger.info(
            f'Done. Download: {t_read}s, data process: {t_processing}s, upload: {t_upload}s, total: {round(t3 - t0, 1)}s'
        )
        try:
            logger.debug('Removing file')
            os.remove(path)
        except Exception as e:
            logger.warning(f'Failed to remove {filename}. Exception: {e}')
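
The read step above passes a callable to usecols, so pandas keeps only the columns for which the callable returns True and never parses the excluded ones. A standalone demonstration of the same pattern:

import io

import pandas as pd

csv = io.StringIO('Date,Clicks,Currency,Connection_id\n2021-05-10,40,EUR,7\n')
keep = lambda col: col not in ['Connection_type_code', 'Connection_id', 'Currency']

# only Date and Clicks survive; the excluded metadata columns are skipped at parse time
print(pd.read_csv(csv, sep=',', usecols=keep))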