def extract_google_analytics_data(gcp_conn_id: str, start_date: str, end_date: str, locations: list):
    """Pull per-hit e-commerce rows from the public GA sample dataset as CSV.

    Queries `bigquery-public-data.google_analytics_sample.ga_sessions_*`
    for the given date range and location filter, converts the micros-valued
    monetary columns to currency units, and indexes by (date, location_name).

    :param gcp_conn_id: Airflow connection id used by BigQueryHook.
    :param start_date: inclusive _TABLE_SUFFIX lower bound, e.g. "20170101".
    :param end_date: inclusive _TABLE_SUFFIX upper bound.
    :param locations: list of dicts, each with "name" (city), "country",
        and "state" (region) keys.  (The original annotation said ``str``,
        but the body iterates dicts — fixed.)
    :return: CSV string of the resulting DataFrame, dates as
        "%Y-%m-%d %H:%M:%S".
    """
    bq_hook = BigQueryHook(gcp_conn_id=gcp_conn_id)

    # Build quoted IN-lists for each location dimension.
    # NOTE(review): values are interpolated straight into the SQL text; this
    # is acceptable only because `locations` is operator-controlled config,
    # not user input — confirm, or switch to query parameters.
    cities = ",".join('"{}"'.format(loc["name"]) for loc in locations)
    countries = ",".join('"{}"'.format(loc["country"]) for loc in locations)
    states = ",".join('"{}"'.format(loc["state"]) for loc in locations)

    # Trailing comma before FROM removed (was a BigQuery syntax hazard).
    query = """SELECT
        fullVisitorId,
        date,
        geoNetwork.city as city,
        geoNetwork.country as country,
        geoNetwork.region as region,
        product.v2ProductCategory as productCategory,
        product.v2ProductName as productName,
        hits.eCommerceAction.action_type as action_type,
        hits.eCommerceAction.step as action_step,
        product.productQuantity as quantity,
        product.productPrice as price,
        product.productRevenue as revenue,
        product.isImpression as isImpression,
        hits.transaction.transactionId as transactionId,
        hits.transaction.transactionRevenue as transactionRevenue,
        hits.transaction.transactionTax as transactionTax,
        hits.transaction.transactionShipping as transactionShipping
        FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
        UNNEST(hits) as hits,
        UNNEST(hits.product) as product
        WHERE _TABLE_SUFFIX BETWEEN '{start_date}' AND '{end_date}'
        AND geoNetwork.country IN ({countries})
        AND geoNetwork.city IN ({cities})
        AND geoNetwork.region IN ({states})
        ORDER BY date ASC""".format(
        start_date=start_date,
        end_date=end_date,
        countries=countries,
        cities=cities,
        states=states,
    )

    df = bq_hook.get_pandas_df(sql=query, dialect="standard")

    # Human-readable composite location key for the index.
    df["location_name"] = df.apply(
        lambda row: get_location_string(row["country"], row["city"], row["region"]),
        axis=1,
    )
    df["date"] = pd.to_datetime(df["date"])

    # GA exports monetary values in micros; convert to currency units.
    micros_columns = [
        "price",
        "revenue",
        "transactionRevenue",
        "transactionTax",
        "transactionShipping",
    ]
    df[micros_columns] = df[micros_columns] / (10**6)

    df = df.set_index(["date", "location_name"])
    return df.to_csv(date_format="%Y-%m-%d %H:%M:%S")
def execute(self, context):
    """Data-quality check: compare summed per-table row counts to a target.

    Runs ``self.sql.format(table)`` for every table in ``self.table_list``
    (each query is expected to return a single aggregate value in column
    ``f0_``), sums the results, and raises if the total does not equal
    ``self.pass_value``.

    :param context: Airflow task context (unused by the check itself).
    :raises ValueError: when the summed count differs from ``pass_value``.
    """
    bq = BigQueryHook(gcp_conn_id=self.gcp_conn_id, use_legacy_sql=False)

    total_rows = []
    for table in self.table_list:
        response = bq.get_pandas_df(self.sql.format(table))
        # An empty result frame counts as zero rows for that table.
        total_rows.append(0 if response.empty else response["f0_"].iloc[0])

    total = sum(total_rows)
    if total != self.pass_value:
        # Single error-level record instead of four info() calls.
        logging.error(
            "Data quality check failed: total=%s per_table=%s expected=%s",
            total,
            total_rows,
            self.pass_value,
        )
        raise ValueError('Data quality check failed')