def get_data(bq_client, bq_storage_client):
    """Fetch, clean, and forecast DAU data at both city and country grain.

    Runs the same raw-data -> prepare -> forecast sequence over the city and
    country source tables, then merges the per-geography forecast dicts
    (country entries overwrite city entries on key collision).

    Args:
        bq_client: BigQuery client used to issue queries.
        bq_storage_client: BigQuery Storage client used to download results.

    Returns:
        dict mapping geography keys to forecast DataFrames.
    """
    merged_forecasts = {}
    for table_name in ("light_funnel_dau_city", "light_funnel_dau_country"):
        raw = get_raw_data(bq_client, bq_storage_client, table_name)
        # Fixed training window shared by both granularities.
        (clean, clean_training) = prepare_data(
            raw, s2d('2016-04-08'), s2d('2020-01-30')
        )
        merged_forecasts.update(forecast(clean_training, clean))
    return merged_forecasts
def dataFilter(data, product):
    """Trim a product's time series to its usable date range.

    Drops rows before the product's known data-start date and excises any
    per-product anomaly window (inclusive endpoints of the window are
    removed). Products without an entry in either table pass through
    untouched (as a copy).

    Args:
        data: DataFrame with a ``ds`` datetime column.
        product: product key used to look up start/anomaly dates.

    Returns:
        A filtered copy of ``data``.
    """
    startDates = {
        "desktop_global": s2d('2016-04-08'),
        "fxa_global": s2d('2018-03-20'),
        "fxa_tier1": s2d('2018-03-20'),
        "Fennec Android": s2d('2017-03-04'),
        "Focus iOS": s2d('2017-12-06'),
        "Focus Android": s2d('2017-07-17'),
        "Fennec iOS": s2d('2017-03-03'),
        "Fenix": s2d('2019-07-03'),
        "Firefox Lite": s2d('2017-03-04'),
        "FirefoxForFireTV": s2d('2018-02-04'),
        "FirefoxConnect": s2d('2018-10-10'),
        "nondesktop_nofire_global": s2d('2017-01-30'),
        "nondesktop_nofire_tier1": s2d('2017-01-30'),
    }
    anomalyDates = {
        "desktop_global": [s2d('2019-05-16'), s2d('2019-06-07')],
        "Focus Android": [s2d('2018-09-01'), s2d('2019-03-01')],
        "Fennec iOS": [s2d('2017-11-08'), s2d('2017-12-31')],
    }
    filtered = data.copy()
    if product in startDates:
        # Keep only rows on/after the product's first reliable data day.
        filtered = filtered[filtered["ds"] >= startDates[product]]
    if product in anomalyDates:
        windowStart, windowEnd = anomalyDates[product]
        # Remove the anomalous window, endpoints included.
        keep = (filtered["ds"] < windowStart) | (filtered["ds"] > windowEnd)
        filtered = filtered[keep]
    return filtered
def _getSinglePrediciton(model, data, trainingEndDate, targetDate):
    """Fit ``model`` on data up to a cutoff and predict a single date.

    Args:
        model: an unfitted Prophet-like model exposing ``fit`` and ``predict``.
        data: DataFrame with a ``ds`` column of dates.
        trainingEndDate: inclusive last date of the training window.
        targetDate: date string to forecast, parsed via ``s2d``.

    Returns:
        Tuple of (point estimate, lower CI bound, upper CI bound) for the
        target date.
    """
    training_frame = data.query("ds <= @trainingEndDate")
    model.fit(training_frame)
    prediction = model.predict(pd.DataFrame({'ds': [s2d(targetDate)]}))
    return (
        prediction.yhat[0],
        prediction.yhat_lower[0],
        prediction.yhat_upper[0],
    )
# This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. import pandas as pd from fbprophet import Prophet from dscontrib.jmccrosky.forecast.utils import s2d # The only holidays we have identified the need to explicitly model are Chinese # New Year and Holi chinese_new_year = pd.DataFrame({ 'ds': [ s2d("2016-02-08"), s2d("2017-01-28"), s2d("2018-02-16"), s2d("2019-02-05"), s2d("2020-01-25") ], 'holiday': "chinese_new_year", 'lower_window': -20, 'upper_window': 20, }) holi = pd.DataFrame({ 'ds': [ s2d("2016-03-06"), s2d("2017-03-13"), s2d("2018-03-02"),
def pipeline(bq_client, bq_storage_client, output_bq_client):
    """Compute forecast deviations for all metrics and publish to BigQuery.

    For each configured metric table, fetches raw data, prepares and
    forecasts it per geography, and accumulates a row per (date, geography)
    with the forecast deviation and CI deviation. The ``analysis.deviations``
    output table is dropped (if present) and recreated, then rows are
    streamed in batches of 10,000.

    Args:
        bq_client: BigQuery client used to read source data.
        bq_storage_client: BigQuery Storage client used to download results.
        output_bq_client: BigQuery client with write access to ``analysis``.

    Returns:
        Tuple of (output DataFrame, list of streaming-insert errors across
        all batches; empty when every insert succeeded).
    """
    metrics = {
        "light_funnel_dau_city": "desktop_dau",
        "light_funnel_dau_country": "desktop_dau",
        "light_funnel_mean_active_hours_per_profile_city":
            "mean_active_hours_per_client",
        "light_funnel_mean_active_hours_per_profile_country":
            "mean_active_hours_per_client",
    }
    output_columns = ["date", "metric", "deviation", "ci_deviation", "geography"]
    output_data = pd.DataFrame(
        {column: [] for column in output_columns},
        columns=output_columns,
    )
    for metric in metrics:
        raw_data = get_raw_data(bq_client, bq_storage_client, metric)
        (clean_data, clean_training_data) = prepare_data(
            raw_data, s2d('2016-04-08'), s2d('2020-01-30')
        )
        forecast_data = forecast(clean_training_data, clean_data)
        for geo in forecast_data:
            output_data = pd.concat([
                output_data,
                pd.DataFrame(
                    {
                        "date": pd.to_datetime(
                            forecast_data[geo].ds).dt.strftime("%Y-%m-%d"),
                        "metric": metrics[metric],
                        "deviation": forecast_data[geo].delta,
                        "ci_deviation": forecast_data[geo].ci_delta,
                        "geography": geo,
                    },
                    columns=output_columns,
                )
            ], ignore_index=True)
    dataset_ref = output_bq_client.dataset("analysis")
    table_ref = dataset_ref.table("deviations")
    # Recreate the output table from scratch; a missing table is fine.
    try:
        output_bq_client.delete_table(table_ref)
    except NotFound:
        pass
    schema = [
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('metric', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('deviation', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('ci_deviation', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('geography', 'STRING', mode='REQUIRED'),
    ]
    table = output_bq_client.create_table(
        bigquery.Table(table_ref, schema=schema)
    )
    # BUGFIX: previously `errors` was only bound inside the loop, so an empty
    # output_data raised UnboundLocalError at the return, and each batch's
    # errors overwrote the previous batch's. Accumulate across batches instead.
    errors = []
    n = len(output_data)
    for i in range(0, n, 10000):
        # Slicing past the end is safe in pandas; no need to clamp with min().
        batch = output_data[i:i + 10000]
        errors.extend(output_bq_client.insert_rows(
            table,
            list(batch.itertuples(index=False, name=None)),
        ))
    return (output_data, errors)