def load_page_data(from_cache=True): logger.debug("Loading data for app_tests ...") global df_demand, df_active_alternatives, df_baseline_vs_tests df_demand = demand_data(spark,from_cache) df_active_alternatives = active_tests_results(spark,from_cache) df_baseline_vs_tests = baseline_versus_tests(spark,from_cache)
def update_selected_experiment(selected_city, selected_series_type, start_date, end_date, selected_experiment):
    logger.debug("Updating selected experiment: " + str(selected_experiment) +
                 " selected_city: " + str(selected_city) +
                 " start_date: " + start_date +
                 " end_date: " + end_date +
                 " selected_series_type: " + selected_series_type)

    # Setting filters
    filters = {}
    if selected_city != -1:
        filters = {'city': selected_city}

    df_exp_agg = get_experiment_agg_data(selected_experiment, start_date, end_date, filters, 'date, alternative_id')

    # x_range, y_range_reta, y_range, conversion, price, arpu, elasticity_label = elasticity_for_experiment(
    #     selected_experiment, initial_window_date=start_date, base_date=end_date, field=selected_series_type)
    # elasticity_plot = plot_pef_elasticity(x_range, y_range_reta, y_range, conversion, price, arpu)

    fig_arpu, fig_ratio, fig_ticket, fig_conversion = plot_experiment_comparison(
        df_exp_agg, BASE_FIELDS[selected_series_type]['fields'])

    for fig in [fig_arpu]:
        format_figure(fig, showlegend=True)
    for fig in [fig_ratio, fig_ticket, fig_conversion]:
        format_figure(fig)

    logger.debug("Updating selected experiment: " + str(selected_experiment) + " FINISHED")

    arpu_label = 'ARPU - ' + BASE_FIELDS[selected_series_type]['label']
    ticket_label = 'Ticket Médio - ' + BASE_FIELDS[selected_series_type]['label']
    conversion_label = 'Conversão (pagos/registered) - ' + BASE_FIELDS[selected_series_type]['label']
    # elasticity_label = 'Elasticidade PEF - ' + elasticity_label

    return fig_arpu, fig_ratio, fig_ticket, fig_conversion, arpu_label, ticket_label, conversion_label
def load_daily_share_rpu(spark, from_cache=False):
    logger.debug("Loading daily_share_rpu ... ")
    query = "SELECT * FROM data_science.consolidated_rpu_per_day WHERE level='Graduação'"
    df = load_from_db_cache(spark, query, 'consolidated_rpu_per_day', from_cache)

    # Pricing log: city -> [price change in %, day of year the change was applied]
    pricing_log = {
        "RECIFE": [-15, 14],
        "CURITIBA": [-10, 14],
        "RIO DE JANEIRO": [-10, 14],
        "JOÃO PESSOA": [-20, 14],
        "BELEM": [-5, 14],
        "SALVADOR": [-10, 14],
        "FORTALEZA": [-10, 14],
        "MANAUS": [10, 14],
        "PORTO VELHO": [20, 14],
        "CARUARU": [-50, 14],
        "FEIRA DE SANTANA": [-30, 14],
        "TERESINA": [-10, 14],
        "CAMPINAS": [-15, 14],
        "OSASCO": [-20, 14],
        "SÃO JOSÉ DOS CAMPOS": [-10, 14],
        "SOROCABA": [-30, 14],
        "SANTO ANDRE": [10, 14],
        "JUAZEIRO DO NORTE": [-10, 37],
        "CUIABA": [-20, 42],
        "BOA VISTA": [-20, 42]
    }

    if from_cache:
        # Simulating pricing logs
        df['pricing_change'] = np.nan
        df['pricing_log'] = np.nan
        for key in pricing_log:
            df.loc[(df['dia_ano'] == pricing_log[key][1]) & (df['city'] == key),
                   'pricing_change'] = pricing_log[key][0]
            df.loc[(df['dia_ano'] == pricing_log[key][1]) & (df['city'] == key),
                   'pricing_log'] = "Alteração de {}% para elevar ARPU e captação".format(pricing_log[key][0])

    df['delta_rpu_20_19'] = (df['receita_acumulada'] / df['ordens_acumuladas']) / (
        df['receita_acumulada_18'] / df['ordens_acumuladas_18']) - 1
    df['delta_pagos_20_19'] = df['pagos_acumulados'] / df['pagos_acumulados_18'] - 1

    # Excluding cities where the deltas are not defined
    df = df[(df['delta_rpu_20_19'] != np.inf) & (df['delta_pagos_20_19'] != np.inf)]
    df = df.sort_values(['city', 'dia_ano'])
    return df
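# A minimal sketch (hypothetical toy data, not warehouse data) of the year-over-year delta
# computed in load_daily_share_rpu above: cumulative revenue per order this year divided by
# the same ratio for the previous year, minus one. Column names mirror the ones used there;
# the values are invented purely for illustration.
def _example_delta_rpu():
    import numpy as np
    import pandas as pd

    toy = pd.DataFrame({
        'receita_acumulada': [1200.0, 600.0],
        'ordens_acumuladas': [10.0, 0.0],        # zero cumulative orders -> inf delta
        'receita_acumulada_18': [1000.0, 800.0],
        'ordens_acumuladas_18': [10.0, 10.0],
    })
    toy['delta_rpu_20_19'] = (toy['receita_acumulada'] / toy['ordens_acumuladas']) / (
        toy['receita_acumulada_18'] / toy['ordens_acumuladas_18']) - 1
    # First row: (120 / 100) - 1 = 0.20, i.e. cumulative RPU is 20% above last year.
    # Second row: division by zero yields inf, which the `!= np.inf` filter above excludes.
    return toy[toy['delta_rpu_20_19'] != np.inf]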
def load_page_data(from_cache=True): logger.debug("Loading data for app_goals ...") global df_goals_only_qb_time_series, df_ies_categorization, df_campaigns, df_goals_only_qb_time_series_summer df_goals_only_qb_time_series = rpu_goals_only_qb_time_series( spark, from_cache) df_ies_categorization = ies_categorization(spark, from_cache) df_campaigns = campaigns(spark, from_cache) df_goals_only_qb_time_series_summer = rpu_goals_only_qb_time_series_summer( spark, from_cache)
def update_city_dropdown(selected_experiment, start_date, end_date):
    logger.debug("Updating city dropdown: " + str(selected_experiment))
    aux = get_experiment_agg_data_per_city(selected_experiment, start_date, end_date)
    aux = aux[aux['order_id'] > MIN_N_POINTS]

    options = [{'label': 'All', 'value': -1}]
    for city, n_orders in zip(aux['city'].tolist(), aux['order_id'].tolist()):
        options.append({
            'label': city + ' (' + str(n_orders) + ' orders)',
            'value': city
        })

    logger.debug("Updating city dropdown: " + str(selected_experiment) + " FINISHED")
    return options, -1
def log_recalculate(spark, from_cache=False):
    logger.debug("Loading log_recalculate ... ")
    query = """
        with mudancas as (
            select * from parcerias.log_changes_pricing
        ),
        base as (
            select datas.dia, modalidade.kind, campus.university_id, campus.city, campus.state, campus.ies
            from (select distinct university_id, city, state, ies from parcerias.log_changes_pricing) as campus
            cross join (select explode(sequence(date('2019-12-01'), date(now()))) dia) as datas
            cross join (select distinct case when parent_id = 1 then 'Presencial' else 'EaD + Semi' end kind
                        from querobolsa_production.kinds
                        where kinds.parent_id is not null) as modalidade
        )
        select base.*, coalesce(mudancas.qtde, 0.0) as qtde
        from base
        left join mudancas
            on base.dia = mudancas.dia
            and base.kind = mudancas.kind
            and base.university_id = mudancas.university_id
            and base.city = mudancas.city
            and base.state = mudancas.state
        order by base.dia, base.kind, mudancas.qtde
    """
    df = load_from_db_cache(spark, query, 'log_recalculate', from_cache)
    # df['university_id'] = df['university_id'].astype(int)
    # NUMBER OF IES PER RELEVANT CITIES
    # df['revenue_city'] = df.groupby(['dia', 'city', 'state', 'kind'])['qtde'].transform('sum')
    return df
def demand_data(spark, from_cache=False):
    logger.debug("Loading demand_data ... ")
    query = """
        SELECT *
        FROM data_science.base_ordens_experimentos
        WHERE base_ordens_experimentos.registered_at BETWEEN '2020-01-01' AND '2020-10-01'
            AND offered_price <> 0
            AND origin IN ('Quero Bolsa')
    """
    df = load_from_db_cache(spark, query, 'demand_data', from_cache)
    return df
def load_daily_pef(spark, from_cache=False):
    logger.debug("Loading consolidated_pef_per_day ... ")
    query = """
        SELECT date,
            city,
            case when name = 'Presencial' then 'Presencial' else 'EaD + Semi' end kind,
            avg(value) as value,
            avg(original_value) as original_value,
            avg(pef_desconto) as pef_desconto
        FROM data_science.consolidated_pef_per_day
        GROUP BY 1,2,3
    """  # WHERE name = 'Presencial'
    df = load_from_db_cache(spark, query, 'consolidated_pef_per_day', from_cache)
    df = df.sort_values(['city', 'date'])
    return df
def updates_per_day(spark, from_cache=False):
    logger.debug("Loading pricing updates ... ")
    query = """
        select *
        from (
            select dia,
                origem,
                ies,
                city,
                state,
                kind,
                qtde,
                row_number() over (partition by ies, city, kind order by qtde desc) as ranking
            from parcerias.log_changes_pricing
        ) dd
    """
    df = load_from_db_cache(spark, query, 'pricing_updates', from_cache)
    return df
def update_selected_alternative(alternative_selection, selected_city, start_date, end_date):
    logger.debug('update_selected_alternative: ' + str(alternative_selection))
    number_of_outputs = 6
    if alternative_selection is None:
        return number_of_outputs * (blank_fig(ROW_HEIGHTS[2]),)

    # Setting filters
    filters = {}
    if selected_city != -1:
        filters = {'city': selected_city}

    start = time.process_time()

    # Setting up the BayesianABTest
    df_cum_results = get_alternative_agg_data(alternative_selection, start_date, end_date, filters)
    # hist_data = {
    #     'conversion': {
    #         'mean': .07,
    #         'std': .2
    #     },
    #     'aov': {
    #         'mean': 500,
    #         'std': 100
    #     }
    # }
    hist_data = None
    unique_alternatives = df_cum_results['alternative'].unique().tolist()
    abtest = BayesianABTest('arpu', unique_alternatives, hist_data=hist_data)

    # Feeding data to the test: each alternative gets its latest cumulative totals
    last_data = df_cum_results.groupby('alternative').agg('last').reset_index()
    for alternative in unique_alternatives:
        n_visits = last_data[last_data['alternative'] == alternative]['n_visits'].values[0]
        n_paids = last_data[last_data['alternative'] == alternative]['n_paids'].values[0]
        revenue = last_data[last_data['alternative'] == alternative]['revenue'].values[0]
        arpu = last_data[last_data['alternative'] == alternative]['arpu'].values[0]
        abtest.feed_alternative_data(
            alternative,
            n_visits=n_visits,
            n_paids=n_paids,
            revenue=revenue
        )

    fig_arpu = abtest.plot_results(plotly=True)
    fig_prob2beat, fig_expLoss = abtest.plot_cumulative_results(df_cum_results, plotly=True)

    legend_colors = {}
    for i in range(len(fig_arpu.data)):
        legend_colors[fig_arpu.data[i]['legendgroup']] = fig_arpu.data[i]['marker']['color']
    format_figure(fig_arpu, showlegend=True, height=ROW_HEIGHTS[2])

    # Formatting and setting the same legend colors
    for fig in [fig_prob2beat, fig_expLoss]:
        format_figure(fig, showlegend=False, height=ROW_HEIGHTS[2])
        for i in range(len(fig.data)):
            label = fig.data[i]['legendgroup'].replace('variable=', '')
            fig.data[i]['line']['color'] = legend_colors[label]

    logger.debug("Time to run BayesianABTest: " + str(time.process_time() - start))
    start = time.process_time()

    # Comparative plots
    fig_offered_price = plot_offered_price_alternative(df_cum_results, legend_colors)
    fig_price = plot_price_discount_alternative(df_cum_results)
    fig_customer_count = plot_customer_count(df_cum_results, legend_colors)
    logger.debug("Time to BayesianABTest plots: " + str(time.process_time() - start))

    return fig_arpu, fig_prob2beat, fig_expLoss, fig_offered_price, fig_price, fig_customer_count
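# A minimal sketch (toy data, hypothetical column subset) of why update_selected_alternative
# takes the last row per alternative: get_alternative_agg_data is assumed to return running
# (cumulative) daily totals, so `agg('last')` yields the totals for the whole window, which is
# what feed_alternative_data receives above. The values below are invented for illustration.
def _example_last_cumulative_row():
    import pandas as pd

    toy = pd.DataFrame({
        'alternative': ['baseline', 'baseline', 'test', 'test'],
        'date': ['2020-05-01', '2020-05-02', '2020-05-01', '2020-05-02'],
        'n_visits': [100, 210, 95, 205],      # cumulative visits
        'n_paids': [7, 15, 9, 20],            # cumulative paid orders
        'revenue': [3500.0, 7400.0, 4300.0, 9800.0],
    })
    last_rows = toy.sort_values('date').groupby('alternative').agg('last').reset_index()
    # One row per alternative holding the 2020-05-02 totals.
    return last_rows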
def daily_order_pef(spark, from_cache=False):
    logger.debug("Loading daily_order_pef ... ")
    query = """
        WITH city_sales AS (
            SELECT DISTINCT sales.campus_city AS city,
                sales.campus_state AS state,
                round(sum(sales.total_revenue)) AS sales
            FROM data_warehouse.sales
            LEFT JOIN querobolsa_production.coupons ON coupons.id = sales.coupon_id
            LEFT JOIN querobolsa_production.offers ON offers.id = coupons.offer_id
            LEFT JOIN querobolsa_production.university_offers ON university_offers.id = offers.university_offer_id
            LEFT JOIN (SELECT * FROM querobolsa_production.kinds WHERE parent_id IS NOT NULL) kinds ON kinds.name = sales.course_kind
            LEFT JOIN (SELECT * FROM querobolsa_production.levels WHERE parent_id IS NOT NULL) levels ON levels.name = sales.course_level
            WHERE university_offers.enrollment_semester IN ('2019.1','2019.2','2020.1')
                AND sales.campus_city IS NOT NULL
                AND sales.campus_city <> ''
            GROUP BY 1,2
            ORDER BY 3 DESC
        ),
        top_in_state AS (
            SELECT city_sales.*
            FROM (SELECT state, max(sales) AS max_sales FROM city_sales GROUP BY 1) AS ref
            JOIN city_sales ON city_sales.state = ref.state AND city_sales.sales = ref.max_sales
        ),
        top_cities AS (
            SELECT * FROM city_sales ORDER BY sales DESC LIMIT 40
        ),
        cidades_alvo AS (
            SELECT * FROM top_cities
            UNION
            SELECT * FROM top_in_state
            ORDER BY sales DESC
        )
        SELECT DATE(orders.registered_at) AS date,
            campuses.city,
            case when k.parent_id = 1 then 'Presencial' else 'EaD + Semi' end kind,
            AVG(orders.price) AS value
        FROM querobolsa_production.orders
        JOIN querobolsa_production.line_items ON orders.id = line_items.order_id
        JOIN querobolsa_production.pre_enrollment_fees ON pre_enrollment_fees.id = line_items.pre_enrollment_fee_id
        JOIN querobolsa_production.offers ON line_items.offer_id = offers.id
        JOIN querobolsa_production.courses ON offers.course_id = courses.id
        JOIN querobolsa_production.campuses ON campuses.id = courses.campus_id
        JOIN querobolsa_production.levels l ON courses.level = l.name AND l.parent_id IS NOT NULL
        JOIN querobolsa_production.levels ON l.parent_id = levels.id
        JOIN querobolsa_production.kinds k ON courses.kind = k.name AND k.parent_id IS NOT NULL
        JOIN querobolsa_production.kinds ON k.parent_id = kinds.id
        JOIN cidades_alvo ON campuses.city = cidades_alvo.city
        WHERE orders.checkout_step NOT IN ('initiated')
            AND l.parent_id = 1
            AND orders.registered_at BETWEEN '2019-12-12' AND '2020-04-01'
        GROUP BY 1,2,3
        ORDER BY 1
    """
    df = load_from_db_cache(spark, query, 'daily_order_pef', from_cache)
    return df
def baseline_versus_tests(spark, from_cache=False):
    logger.debug("Loading baseline_versus_tests ... ")
    query = """
        WITH base AS (
            SELECT DATE(base_ordens_experimentos.registered_at) AS date,
                fee_experiment_id,
                -- alternative,
                CASE WHEN alternative = 'baseline' THEN 'baseline' ELSE 'testes' END AS alternative_kind,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS paids,
                COUNT(DISTINCT customer_id) AS customers,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END)/COUNT(DISTINCT customer_id) AS conversao,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio,
                (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio_ltv,
                SUM(orders.price)/COUNT(DISTINCT customer_id) AS ticket_customer,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) AS revenue,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)) AS receita_com_ltv,
                (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/COUNT(DISTINCT customer_id) AS rpu,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/COUNT(DISTINCT customer_id) AS rpu_sem_ltv
            FROM data_science.base_ordens_experimentos
            LEFT JOIN data_warehouse.sales ON base_ordens_experimentos.order_id = sales.order_id
            JOIN querobolsa_production.orders ON base_ordens_experimentos.order_id = orders.id
            JOIN querobolsa_production.line_items ON orders.id = line_items.order_id
            JOIN querobolsa_production.offers ON offers.id = line_items.offer_id
            JOIN querobolsa_production.courses ON courses.id = offers.course_id
            JOIN querobolsa_production.kinds k ON k.name = courses.kind
            JOIN querobolsa_production.kinds ON k.parent_id = kinds.id
            JOIN querobolsa_production.levels l ON l.name = courses.level
            JOIN querobolsa_production.levels ON l.parent_id = levels.id
            WHERE base_ordens_experimentos.registered_at BETWEEN '2020-04-01' AND current_date - interval 2 days
                AND origin IN ('Quero Bolsa')
            GROUP BY 1,2,3
            ORDER BY 1,2
        ),
        base_limpa AS (
            SELECT date, fee_experiment_id, alternative_kind, paids, customers, receita_com_ltv
            FROM base
            ORDER BY 1, alternative_kind
        ),
        base_evolutivo AS (
            SELECT date,
                fee_experiment_id,
                SUM(CASE WHEN alternative_kind = 'baseline' THEN paids ELSE 0 END) AS baseline_paids,
                SUM(CASE WHEN alternative_kind = 'baseline' THEN customers ELSE 0 END) AS baseline_customers,
                SUM(CASE WHEN alternative_kind = 'baseline' THEN receita_com_ltv ELSE 0 END) AS baseline_revenue,
                SUM(CASE WHEN alternative_kind = 'testes' THEN paids ELSE 0 END) AS testes_paids,
                SUM(CASE WHEN alternative_kind = 'testes' THEN customers ELSE 0 END) AS testes_customers,
                SUM(CASE WHEN alternative_kind = 'testes' THEN receita_com_ltv ELSE 0 END) AS testes_revenue
            FROM base_limpa
            GROUP BY 1,2
        ),
        base_consolidada AS (
            SELECT date,
                fee_experiment_id,
                -- Acumulado
                SUM(baseline_customers) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS baseline_customers_ac,
                SUM(baseline_paids) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS baseline_paids_ac,
                SUM(baseline_revenue) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS baseline_revenue_ac,
                SUM(testes_customers) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS testes_customers_ac,
                SUM(testes_paids) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS testes_paids_ac,
                SUM(testes_revenue) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS testes_revenue_ac,
                -- Média Móvel (7-day moving average)
                AVG(baseline_customers) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS baseline_customers_mm,
                AVG(baseline_paids) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS baseline_paids_mm,
                AVG(baseline_revenue) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS baseline_revenue_mm,
                AVG(testes_customers) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS testes_customers_mm,
                AVG(testes_paids) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS testes_paids_mm,
                AVG(testes_revenue) OVER (PARTITION BY fee_experiment_id ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS testes_revenue_mm
            FROM base_evolutivo
        ),
        base_final AS (
            SELECT date,
                fee_experiment_id,
                baseline_revenue_ac,
                testes_revenue_ac,
                baseline_customers_ac,
                testes_customers_ac,
                baseline_paids_ac,
                testes_paids_ac,
                baseline_revenue_ac/baseline_customers_ac AS rpu_baseline_ac,
                testes_revenue_ac/testes_customers_ac AS rpu_testes_ac,
                baseline_paids_ac/baseline_customers_ac AS conversao_baseline_ac,
                testes_paids_ac/testes_customers_ac AS conversao_testes_ac,
                baseline_revenue_mm/baseline_customers_mm AS rpu_baseline_mm,
                testes_revenue_mm/testes_customers_mm AS rpu_testes_mm,
                baseline_paids_mm/baseline_customers_mm AS conversao_baseline_mm,
                testes_paids_mm/testes_customers_mm AS conversao_testes_mm
            FROM base_consolidada
            WHERE fee_experiment_id IN (56,57,58,59,60,61,62,63,65)
        )
        SELECT *,
            round(rpu_testes_ac/rpu_baseline_ac-1,2) AS rpu_gain,
            round(conversao_testes_ac/conversao_baseline_ac-1,2) AS conversion_gain
        FROM base_final
    """
    df = load_from_db_cache(spark, query, 'baseline_versus_tests', from_cache)
    return df
def active_tests_results(spark, from_cache=False):
    logger.debug("Loading active_tests_results ... ")
    query = """
        WITH alternatives_per_day AS (
            SELECT date,
                experiments_aggregate_base.fee_experiment_id,
                collect_set(alternative_id) AS alternative_ids
            FROM data_science.experiments_aggregate_base
            WHERE date >= '2020-04-01'
                AND alternative_ratio IS NOT NULL
            GROUP BY 1,2
        ),
        active_alternatives AS (
            SELECT fee_experiment_id, alternative_ids
            FROM alternatives_per_day
            WHERE date = date_sub(current_date(), 3)
        ),
        test_start AS (
            SELECT alternatives_per_day.fee_experiment_id,
                min(alternatives_per_day.date) AS date
            FROM alternatives_per_day
            JOIN active_alternatives ON active_alternatives.alternative_ids = alternatives_per_day.alternative_ids
            GROUP BY 1
        ),
        base_resultados AS (
            SELECT test_start.date AS test_start_date,
                base_ordens_experimentos.fee_experiment_id,
                alternative,
                CASE WHEN alternative = 'seasonality minus 25' THEN 'c'
                     ELSE CASE WHEN alternative = 'seasonality lowest' THEN 'd'
                          ELSE CASE WHEN alternative = 'seasonality minus 75' THEN 'e'
                               ELSE alternative END END END AS ordem,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS paids,
                COUNT(DISTINCT customer_id) AS customers,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END)/COUNT(DISTINCT customer_id) AS conversao,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio,
                (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio_ltv,
                SUM(orders.price)/COUNT(DISTINCT customer_id) AS ticket_customer,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) AS revenue,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)) AS receita_com_ltv,
                (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/COUNT(DISTINCT customer_id) AS rpu,
                SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/COUNT(DISTINCT customer_id) AS rpu_sem_ltv,
                AVG(base_ordens_experimentos.offered_price) AS offered_price
            FROM data_science.base_ordens_experimentos
            LEFT JOIN data_warehouse.sales ON base_ordens_experimentos.order_id = sales.order_id
            JOIN querobolsa_production.orders ON base_ordens_experimentos.order_id = orders.id
            JOIN querobolsa_production.line_items ON orders.id = line_items.order_id
            JOIN querobolsa_production.offers ON offers.id = line_items.offer_id
            JOIN querobolsa_production.courses ON courses.id = offers.course_id
            JOIN querobolsa_production.kinds k ON k.name = courses.kind
            JOIN querobolsa_production.kinds ON k.parent_id = kinds.id
            JOIN querobolsa_production.levels l ON l.name = courses.level
            JOIN querobolsa_production.levels ON l.parent_id = levels.id
            JOIN test_start ON test_start.fee_experiment_id = base_ordens_experimentos.fee_experiment_id
                AND base_ordens_experimentos.registered_at >= test_start.date
            WHERE origin IN ('Quero Bolsa')
            GROUP BY 1,2,3
            ORDER BY 1,2
        )
        SELECT *
        FROM base_resultados
        WHERE customers > 50
        ORDER BY fee_experiment_id
    """
    df = load_from_db_cache(spark, query, 'active_tests_results', from_cache)
    return df
def city2ies(spark, from_cache=False):
    logger.debug("Loading city2ies ... ")
    query = """
        WITH city_sales as (
            select distinct campuses.city_id,
                sales.campus_city as city,
                sales.campus_state as state,
                kinds.parent_id as kind_id,
                round(sum(sales.total_revenue)) as sales
            from data_warehouse.sales
            left join querobolsa_production.coupons on coupons.id = sales.coupon_id
            left join querobolsa_production.offers on offers.id = coupons.offer_id
            left join querobolsa_production.university_offers on university_offers.id = offers.university_offer_id
            left join (select * from querobolsa_production.kinds where parent_id is not null) kinds on kinds.name = sales.course_kind
            left join (select * from querobolsa_production.levels where parent_id is not null) levels on levels.name = sales.course_level
            left join querobolsa_production.campuses on campuses.id = sales.campus_id
            where university_offers.enrollment_semester in ('2019.1','2019.2','2020.1')
                and sales.campus_city is not null
                and sales.campus_city <> ''
            group by 1,2,3,4
            order by 5 desc
        ),
        top_in_state as (
            select city_sales.*
            from (select kind_id, state, max(sales) as max_sales from city_sales group by 1,2) as ref
            join city_sales on city_sales.state = ref.state
                and city_sales.sales = ref.max_sales
                and ref.kind_id = city_sales.kind_id
        ),
        top_cities as (
            select * from city_sales order by sales desc limit 40
        ),
        filter_cities as (
            select * from top_cities
            union
            select * from top_in_state
            order by sales desc
        )
        select distinct campuses.city_id,
            sales.campus_city as city,
            sales.campus_state as state,
            case when kinds.id in (3,8) then 'EaD + Semi' else 'Presencial' end kind,
            levels.name AS level,
            offers.university_id,
            universities.name,
            sum(sales.total_revenue) as revenue
        from data_warehouse.sales
        left join querobolsa_production.coupons on coupons.id = sales.coupon_id
        left join querobolsa_production.offers on offers.id = coupons.offer_id
        left join querobolsa_production.universities on universities.id = offers.university_id
        left join querobolsa_production.university_offers on university_offers.id = offers.university_offer_id
        left join querobolsa_production.courses on sales.course_id = courses.id
        left join querobolsa_production.kinds k ON k.name = sales.course_kind AND k.parent_id IS NOT NULL
        left join querobolsa_production.kinds ON k.parent_id = kinds.id
        left join querobolsa_production.levels l ON l.name = sales.course_level AND l.parent_id IS NOT NULL
        left join querobolsa_production.levels ON l.parent_id = levels.id
        join filter_cities on filter_cities.city = sales.campus_city
            and filter_cities.state = sales.campus_state
            and filter_cities.kind_id = kinds.id
        left join querobolsa_production.campuses on campuses.id = sales.campus_id
        where sales.payment_date BETWEEN '2019-10-01' AND '2020-04-01'
            and offers.university_id is not null
            and levels.id = 1
            and kinds.id IN (1,3,8)
        group by 1,2,3,4,5,6,7
        order by 8 desc
    """
    df = load_from_db_cache(spark, query, 'city2ies', from_cache)
    df['university_id'] = df['university_id'].astype(int)

    # NUMBER OF IES PER RELEVANT CITIES
    df['revenue_city'] = df.groupby(['city', 'state', 'kind'])['revenue'].transform('sum')
    df['relevance_city'] = df['revenue'] / df['revenue_city']
    df['cumrelevance_city'] = df.groupby(
        ['city', 'state', 'kind'])['relevance_city'].transform(lambda x: x.cumsum())
    # df[df['city'] == 'Brasília'].sort_values('revenue', ascending=False)
    df = df.sort_values(['revenue_city', 'revenue'], ascending=False)

    # Keep only the IES that make up the first 80% of each city's cumulative revenue relevance
    df = df[df['cumrelevance_city'] < .8]
    return df
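# A minimal sketch (hypothetical toy data) of the 80% cumulative-relevance cut applied at the
# end of city2ies: IES are ordered by revenue within a city and kept only while their cumulative
# share of the city's revenue stays below 0.8. The toy groups by 'city' only, whereas the real
# code groups by ['city', 'state', 'kind'].
def _example_relevance_cut():
    import pandas as pd

    toy = pd.DataFrame({
        'city': ['A', 'A', 'A', 'A'],
        'university_id': [1, 2, 3, 4],
        'revenue': [500.0, 250.0, 150.0, 100.0],
    }).sort_values('revenue', ascending=False)

    toy['revenue_city'] = toy.groupby('city')['revenue'].transform('sum')        # 1000 for city A
    toy['relevance_city'] = toy['revenue'] / toy['revenue_city']                 # 0.50, 0.25, 0.15, 0.10
    toy['cumrelevance_city'] = toy.groupby('city')['relevance_city'].cumsum()    # 0.50, 0.75, 0.90, 1.00
    # The `< .8` cut keeps the first two IES (cumulative shares 0.50 and 0.75);
    # the third crosses the 0.8 threshold and is dropped.
    return toy[toy['cumrelevance_city'] < .8]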