def _bq_to_feast(conn_id: str, project: str, sql: str, feature_set: str):
    feast_hook = FeastHook(conn_id)
    client = feast_hook.get_client(project)
    bq = BigQueryHook(use_legacy_sql=False, location='US')
    features_df = bq.get_pandas_df(sql)
    # ingest features into feast (a partitioned BQ table)
    client.ingest(feature_set, features_df)
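# A minimal sketch of how _bq_to_feast might be wired into a DAG with a
# PythonOperator; the connection ID, Feast project, SQL, and feature set
# name below are placeholders, not values from the original snippet.
from airflow.operators.python_operator import PythonOperator

ingest_features = PythonOperator(
    task_id='ingest_features',
    python_callable=_bq_to_feast,
    op_kwargs={
        'conn_id': 'feast_default',          # assumed Feast connection ID
        'project': 'my_feast_project',       # placeholder Feast project
        'sql': 'SELECT * FROM `my-project.analytics.driver_features`',  # placeholder query
        'feature_set': 'driver_stats',       # placeholder feature set name
    },
    dag=dag,
)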
def bq_to_pubsub_query_executor(**kwargs):
    """Executes a custom detector query in BigQuery and passes the results to the next task"""
    query = kwargs['templates_dict']['query']
    logging.info(query)
    bigquery_hook = BigQueryHook(use_legacy_sql=False)
    df = bigquery_hook.get_pandas_df(sql=query)
    # base64-encode each row's JSON as a Pub/Sub message payload
    messages = [{'data': b64e(row.to_json().encode()).decode()}
                for index, row in df.iterrows()]
    # split the messages into chunks of 1000 (Pub/Sub publish limit)
    messages_chunks = chunks(messages, 1000)
    pubsub_hook = PubSubHook()
    for chunk in messages_chunks:
        pubsub_hook.publish(project=gcp_project, topic=pubsub_topic, messages=chunk)
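# The chunks helper used above is not shown in the snippet; a minimal sketch,
# assuming it simply yields successive fixed-size slices of a list:
def chunks(items, size):
    # yield consecutive slices of at most `size` elements
    for i in range(0, len(items), size):
        yield items[i:i + size]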
class BigQueryToFeastFeatureSetOperator(BaseOperator):
    def __init__(self, conn_id: str, project: str, feature_set_name: str,
                 entity_name: str, sql: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.feast_client = FeastHook(conn_id).get_client(project)
        self.project = project
        self.feature_set_name = feature_set_name
        self.entity_name = entity_name
        self.sql = sql
        self.bq = BigQueryHook(use_legacy_sql=False, location='US')

    def execute(self, context):
        features_df = self.bq.get_pandas_df(self.sql)
        fs = FeatureSet(
            self.feature_set_name,
            max_age=Duration(seconds=86400),
            entities=[Entity(name=self.entity_name, dtype=ValueType.INT64)])
        fs.infer_fields_from_df(features_df, replace_existing_features=True)
        self.feast_client.apply(fs)
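# A minimal sketch of how this operator might be used in a DAG; the DAG name,
# connection ID, Feast project, entity, and SQL below are placeholders.
from datetime import datetime
from airflow import DAG

with DAG('feast_feature_sets', start_date=datetime(2021, 1, 1),
         schedule_interval='@daily') as dag:
    register_feature_set = BigQueryToFeastFeatureSetOperator(
        task_id='register_feature_set',
        conn_id='feast_default',           # assumed Feast connection ID
        project='my_feast_project',        # placeholder Feast project
        feature_set_name='driver_stats',   # placeholder feature set name
        entity_name='driver_id',           # placeholder entity column (INT64)
        sql='SELECT * FROM `my-project.analytics.driver_stats`',  # placeholder query
    )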
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.sql)
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to,
            location=self.location,
        )
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    job_id = self.bq_cursor.run_query(
        sql=self.sql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        flatten_results=self.flatten_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        maximum_bytes_billed=self.maximum_bytes_billed,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        labels=self.labels,
        schema_update_options=self.schema_update_options,
        priority=self.priority,
        time_partitioning=self.time_partitioning,
        api_resource_configs=self.api_resource_configs,
        cluster_fields=self.cluster_fields,
    )
    context['task_instance'].xcom_push(key='job_id', value=job_id)
    # fetch the query results as a DataFrame and return them (pushed to XCom)
    df = hook.get_pandas_df(self.sql)
    if self.sort_by is not None:
        # sort_values returns a new DataFrame; reassign instead of discarding it
        df = df.sort_values(self.sort_by)
    list_to_return = df.astype(str).to_dict('index')
    self.log.info(list_to_return)
    return list_to_return
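# A downstream task could read what execute() pushed to XCom; a minimal
# sketch, assuming the task built from this operator has the hypothetical
# task_id 'run_bq_query':
def consume_query_results(**context):
    ti = context['ti']
    rows = ti.xcom_pull(task_ids='run_bq_query')                  # return value of execute()
    job_id = ti.xcom_pull(task_ids='run_bq_query', key='job_id')  # BigQuery job id
    logging.info('Job %s returned %d rows', job_id, len(rows))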
def get_df_and_types(**context):
    '''Set the columns and types of a csv to create a table with SQL.
    Save a CSV to import later'''

    def traducir(x):
        '''Translate pandas dtypes to SQL types'''
        tipoSQL = []
        for i in x:
            if i == 'float64':
                tr = 'float'
            elif i == 'int64':
                tr = 'int'
            else:
                tr = 'text'
            tipoSQL.append(str(tr))
        return tipoSQL

    bq = BigQueryHook(bigquery_conn_id=cfg.bigquery_conn_id, use_legacy_sql=False)
    df = bq.get_pandas_df(QUERY)
    # save the raw rows as a CSV to import later
    df.to_csv('{}/cloudSQLexport_temp.csv'.format(DIR_TMP), index=None, header=False)
    # build a column-name / SQL-type mapping from the DataFrame dtypes
    df = pd.DataFrame(df.dtypes)
    df.columns = ['tipo']
    df['nombre'] = df.index
    df['tipoSQL'] = traducir(df['tipo'])
    df['col_sql'] = df.apply(
        lambda x: "{} {} NULL".format(x['nombre'], x['tipoSQL']), axis=1)
    sqlcolumnas = ", ".join(df['col_sql'])
    # push the CREATE TABLE statement for a downstream task to execute
    context['ti'].xcom_push(key='SQL',
                            value="CREATE TABLE {} ({})".format(TABLE, sqlcolumnas))
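# A minimal sketch of a downstream callable that pulls the generated CREATE
# TABLE statement from XCom; the upstream task_id 'get_df_and_types' and how
# the statement is executed are assumptions, not part of the original snippet.
def create_table(**context):
    create_sql = context['ti'].xcom_pull(task_ids='get_df_and_types', key='SQL')
    logging.info('Would run: %s', create_sql)  # e.g. hand off to a Cloud SQL / MySQL hook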
def run(self):
    from airflow.contrib.hooks.bigquery_hook import BigQueryHook

    hook = BigQueryHook(bigquery_conn_id="bigquery_default", use_legacy_sql=False)
    self.output = hook.get_pandas_df(self.query)