Example #1
def _bq_to_feast(conn_id: str, project: str, sql: str, feature_set: str):
    feast_hook = FeastHook(conn_id)
    client = feast_hook.get_client(project)
    bq = BigQueryHook(use_legacy_sql=False, location='US')
    features_df = bq.get_pandas_df(sql)
    # ingest features into feast (a partitioned BQ table)
    client.ingest(feature_set, features_df)
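A minimal sketch of how this callable might be wired into a DAG via PythonOperator; the task ID, connection ID, project, query, and feature set name below are placeholders, not values from the original:

from airflow.operators.python_operator import PythonOperator

bq_to_feast_task = PythonOperator(
    task_id='bq_to_feast',                        # hypothetical task ID
    python_callable=_bq_to_feast,
    op_kwargs={
        'conn_id': 'feast_default',               # hypothetical connection ID
        'project': 'my_feast_project',            # hypothetical Feast project
        'sql': 'SELECT * FROM `my_ds.features`',  # hypothetical query
        'feature_set': 'my_feature_set',          # hypothetical feature set
    },
    dag=dag,
)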
Example #2
def bq_to_pubsub_query_executor(**kwargs):
    """Executes a custom detector query in BigQuery and passes the results to the next task"""

    query = kwargs['templates_dict']['query']
    logging.info(query)
    bigquery_hook = BigQueryHook(use_legacy_sql=False)
    df = bigquery_hook.get_pandas_df(sql=query)

    messages = [{
        'data': b64e(row.to_json().encode()).decode()
    } for _, row in df.iterrows()]
    """splitting the array to 1000 size chunks (PubSub limit)"""
    messages_chunks = chunks(messages, 1000)
    pubsub_hoook = PubSubHook()
    for chunk in messages_chunks:
        pubsub_hoook.publish(project=gcp_project,
                             topic=pubsub_topic,
                             messages=chunk)
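The snippet above relies on a `chunks` helper and a `b64e` alias that are not shown. A minimal sketch of what they could look like, assuming `b64e` is the usual shorthand for `base64.b64encode`:

from base64 import b64encode as b64e  # assumed alias for the b64e used above

def chunks(lst, n):
    '''Yield successive n-sized chunks from lst (hypothetical helper).'''
    for i in range(0, len(lst), n):
        yield lst[i:i + n]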
Example #3
class BigQueryToFeastFeatureSetOperator(BaseOperator):
    def __init__(self, conn_id: str, project: str, feature_set_name: str,
                 entity_name: str, sql: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.feast_client = FeastHook(conn_id).get_client(project)
        self.project = project
        self.feature_set_name = feature_set_name
        self.entity_name = entity_name
        self.sql = sql
        self.bq = BigQueryHook(use_legacy_sql=False, location='US')

    def execute(self, context):
        features_df = self.bq.get_pandas_df(self.sql)
        fs = FeatureSet(
            self.feature_set_name,
            max_age=Duration(seconds=86400),
            entities=[Entity(name=self.entity_name, dtype=ValueType.INT64)])
        fs.infer_fields_from_df(features_df, replace_existing_features=True)
        self.feast_client.apply(fs)
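A minimal sketch of how this operator might be instantiated in a DAG; every argument value below is a placeholder, not taken from the original:

register_features = BigQueryToFeastFeatureSetOperator(
    task_id='register_features',                    # hypothetical task ID
    conn_id='feast_default',                        # hypothetical connection ID
    project='my_feast_project',                     # hypothetical Feast project
    feature_set_name='customer_features',           # hypothetical feature set
    entity_name='customer_id',                      # hypothetical entity
    sql='SELECT * FROM `my_ds.customer_features`',  # hypothetical query
    dag=dag,
)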
Example #4
    def execute(self, context):
        # build the hook up front so it is also available for get_pandas_df below
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to,
            location=self.location,
        )
        if self.bq_cursor is None:
            self.log.info('Executing: %s', self.sql)
            conn = hook.get_conn()
            self.bq_cursor = conn.cursor()
        job_id = self.bq_cursor.run_query(
            sql=self.sql,
            destination_dataset_table=self.destination_dataset_table,
            write_disposition=self.write_disposition,
            allow_large_results=self.allow_large_results,
            flatten_results=self.flatten_results,
            udf_config=self.udf_config,
            maximum_billing_tier=self.maximum_billing_tier,
            maximum_bytes_billed=self.maximum_bytes_billed,
            create_disposition=self.create_disposition,
            query_params=self.query_params,
            labels=self.labels,
            schema_update_options=self.schema_update_options,
            priority=self.priority,
            time_partitioning=self.time_partitioning,
            api_resource_configs=self.api_resource_configs,
            cluster_fields=self.cluster_fields,
        )
        context['task_instance'].xcom_push(key='job_id', value=job_id)

        df = hook.get_pandas_df(self.sql)

        if self.sort_by is not None:
            df = df.sort_values(self.sort_by)

        list_to_return = df.astype(str).to_dict('index')
        self.log.info(list_to_return)
        return list_to_return
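Because the operator pushes the job ID to XCom and returns the rows, a downstream task can pick either up. A minimal sketch, assuming the operator above runs under the hypothetical task ID `run_query`:

def downstream(**context):
    ti = context['task_instance']
    # both the key and the task ID here are assumptions for illustration
    job_id = ti.xcom_pull(task_ids='run_query', key='job_id')
    rows = ti.xcom_pull(task_ids='run_query')  # the operator's return value
    print('BigQuery job %s returned %d rows' % (job_id, len(rows)))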
Example #5
def get_df_and_types(**context):
    '''Set the columns and types of a CSV to create a table with SQL.
    Save a CSV to import later.'''
    def traducir(x):
        '''Translate pandas dtypes to SQL types'''
        tipoSQL = []
        tr = ''
        for i in x:
            if i == 'float64':
                tr = 'float'
            elif i == 'int64':
                tr = 'int'
            else:
                tr = 'text'
            tipoSQL.append(str(tr))
        return tipoSQL

    bq = BigQueryHook(bigquery_conn_id=cfg.bigquery_conn_id,
                      use_legacy_sql=False)

    df = bq.get_pandas_df(QUERY)
    df.to_csv('{}/cloudSQLexport_temp.csv'.format(DIR_TMP),
              index=False,
              header=False)

    df = pd.DataFrame(df.dtypes)
    df.columns = ['tipo']
    df['nombre'] = df.index
    df['tipoSQL'] = traducir(df['tipo'])
    df['col_sql'] = df.apply(
        lambda x: "{} {} NULL".format(x['nombre'], x['tipoSQL']), axis=1)

    sqlcolumnas = ", ".join(df['col_sql'])
    context['ti'].xcom_push(key='SQL',
                            value="CREATE TABLE {} ({})".format(
                                TABLE, sqlcolumnas))
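A downstream task could then pull the generated CREATE TABLE statement from XCom and execute it before importing the CSV; a minimal sketch, where the task ID and the `run_ddl` helper are both hypothetical:

def create_table(**context):
    # pull the DDL pushed by get_df_and_types (assumed task ID)
    ddl = context['ti'].xcom_pull(task_ids='get_df_and_types', key='SQL')
    run_ddl(ddl)  # hypothetical helper that runs the statement on Cloud SQL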
Example #6
    def run(self):
        from airflow.contrib.hooks.bigquery_hook import BigQueryHook

        hook = BigQueryHook(bigquery_conn_id="bigquery_default",
                            use_legacy_sql=False)
        self.output = hook.get_pandas_df(self.query)