Example #1
def setUpModule():
    Config.CLIENT = bigquery.Client()
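Only the unittest hook is shown here. A minimal sketch of the assumed surrounding test module (the `Config` holder class and the imports are assumptions, not part of the original snippet):

# Hedged sketch: the module context this hook is assumed to live in.
import unittest

from google.cloud import bigquery


class Config:
    """Shared test state; CLIENT is created once per test module."""
    CLIENT = None


def setUpModule():
    # unittest runs this once before any test in the module.
    Config.CLIENT = bigquery.Client()


class SmokeTest(unittest.TestCase):
    def test_client_was_created(self):
        self.assertIsNotNone(Config.CLIENT)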
Example #2
from absl import app, flags  # absl flags used instead of argparse
from google.cloud import bigquery

from peloton_client import peloton_client

FLAGS = flags.FLAGS
flags.DEFINE_string('peloton_username', None, 'Your Peloton username.')
flags.DEFINE_string('peloton_password', None, 'Your Peloton password.')
flags.DEFINE_string('table_id', None, 'The BigQuery table name to store data.')
flags.mark_flag_as_required('peloton_username')
flags.mark_flag_as_required('peloton_password')
flags.mark_flag_as_required('table_id')

DATASET = 'peloton'
BQ_CLIENT = bigquery.Client()  # authenticates via application default credentials
JOB_CONFIG = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")


def extract_data(input_data):
    output_dict = {}
    for x in input_data:
        output_dict[x.get('slug')] = x.get('value')
    return output_dict


def get_workout_data(workouts):
    output_workouts = []
    for workout in workouts:
        workout_metrics = PT_CLIENT.fetch_workout_metrics(workout.get('id'))

        workout_core_stats = extract_data(workout_metrics.get('summaries'))
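The snippet above is cut off mid-function, but extract_data is complete; a small hedged usage sketch follows (the sample summaries are invented for illustration):

# Hedged usage sketch for extract_data; the sample summary values are made up.
sample_summaries = [
    {'slug': 'distance', 'value': 8.05},
    {'slug': 'calories', 'value': 250},
    {'slug': 'total_output', 'value': 180},
]
print(extract_data(sample_summaries))
# -> {'distance': 8.05, 'calories': 250, 'total_output': 180}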
Example #3
import os, sys
from datetime import datetime

from google.cloud import bigquery
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

import bigquery_functions as bqf
import graph_functions as grf

# Global Directories
from global_config import config
data_dir = config.get_property('data_dir')
analysis_dir = config.get_property('analysis_dir')

# Starts the BigQuery Client
client = bigquery.Client(location="US")

# Constants
# -------------
date_format = "%Y-%m-%d"
eps = 0.001
total_places = 10
percentage = 0.1  # Percentage of top places
round_number = 4  # 15m
days_go_back = 14  # Two weeks
div = 1200  # in meters for the Soft Plus

MAX_RESOLUTION = 5
MAX_DISTANCE = 3
MAX_TIME = 2
Example #4
 def setUpClass(cls):
     cls._bq_cli = bq.Client()
Example #5
	import datetime
	import get_data_verifly as gdv
	import metrics
	import gen_data
	from verifly_metrics import merge_data
	from google.cloud import bigquery
	from google.oauth2 import service_account

	'''
	The main script function, called daily via a cron job on a Google Cloud VM.
	It runs the extract and transform scripts, joins the result with the most
	recent version stored in BigQuery, and re-uploads the combined result.
	'''

	# Construct the client and authenticate to the Google Cloud API
	credentials = service_account.Credentials.from_service_account_file('/home/nick/adjust/keys/vfly/verifly-237116-82a9642924f2.json')
	project_id = 'verifly-237116'
	client = bigquery.Client(credentials=credentials, project=project_id)

	# Create two date strings: one for yesterday and one for 30 days before that
	yesterday = datetime.date.today() - datetime.timedelta(days=1)
	last30 = yesterday - datetime.timedelta(days=30)
	yesterday_str = str(yesterday)
	last30_str = str(last30)

	#naming tables
	dataset_id = 'adjust_deliverables'
	csv_name = 'vfly_deliverables_metrics.csv' #+ today
	table_name_bigquery = "vfly_deliverables_metrics"
	local_path = "/home/nick/adjust/data/verifly/deliverables/" + csv_name
	print("Local path: " + local_path)

	#download old data as dataframe from google big query
Example #6
 def start_bundle(self):
     if not self.bigquery_client:
         self.bigquery_client = bigquery.Client()
     self.dataset_location = self.get_dataset_location()
     self.load_jobs = {}
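This start_bundle comes from an Apache Beam DoFn: the client is created lazily per bundle because a bigquery.Client generally cannot be pickled along with the DoFn. A hedged sketch of the surrounding class (the class name and the get_dataset_location body are assumptions):

# Hedged sketch of the assumed surrounding DoFn; only start_bundle appears in the original.
import apache_beam as beam
from google.cloud import bigquery


class LoadToBigQueryFn(beam.DoFn):
    def __init__(self, dataset_ref):
        self.dataset_ref = dataset_ref
        self.bigquery_client = None  # created lazily in start_bundle; clients don't pickle well

    def get_dataset_location(self):
        # Assumption: read the location from the dataset metadata.
        return self.bigquery_client.get_dataset(self.dataset_ref).location

    def start_bundle(self):
        if not self.bigquery_client:
            self.bigquery_client = bigquery.Client()
        self.dataset_location = self.get_dataset_location()
        self.load_jobs = {}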
Example #7
import os

from google.cloud import bigquery
from google.cloud import firestore
from google.cloud import pubsub_v1
from google.cloud import storage
import pytz



PROJECT_ID = os.getenv('GCP_PROJECT')
BQ_DATASET = 'mydataset'
BQ_TABLE = 'mytable'
ERROR_TOPIC = 'projects/%s/topics/%s' % (PROJECT_ID, 'streaming_error_topic')
SUCCESS_TOPIC = 'projects/%s/topics/%s' % (PROJECT_ID, 'streaming_success_topic')
DB = firestore.Client()
CS = storage.Client()
PS = pubsub_v1.PublisherClient()
BQ = bigquery.Client()


def streaming(data, context):
    '''This function is executed whenever a file is added to Cloud Storage'''
    bucket_name = data['bucket']
    file_name = data['name']
    db_ref = DB.document(u'streaming_files/%s' % file_name)
    if _was_already_ingested(db_ref):
        _handle_duplication(db_ref)
    else:
        try:
            _insert_into_bigquery(bucket_name, file_name)
            _handle_success(db_ref)
        except Exception:
            _handle_error(db_ref)
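The helpers referenced above are not shown. A hedged sketch of what _insert_into_bigquery might look like with the module-level BQ client and the streaming-insert API, assuming each uploaded file holds a single JSON record:

# Hedged sketch of the _insert_into_bigquery helper; its shape and error handling are assumptions.
import json


def _insert_into_bigquery(bucket_name, file_name):
    blob = CS.get_bucket(bucket_name).blob(file_name)
    row = json.loads(blob.download_as_bytes())
    table_id = '%s.%s.%s' % (PROJECT_ID, BQ_DATASET, BQ_TABLE)
    errors = BQ.insert_rows_json(table_id, [row], row_ids=[file_name])
    if errors:
        raise RuntimeError('BigQuery streaming insert failed: %s' % errors)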
Example #8
def load_new_data(dataset_id, table_name, next_load_date, end_load_date):
    '''
    Queries snippet-related Google Analytics properties and loads the results into a permanent BigQuery table
    :param dataset_id: Name of dataset to be loaded into
    :param table_name: Name of table to be loaded into
    :param next_load_date: Earliest date to be loaded into table_name
    :param end_load_date: Latest date to be loaded into table_name
    :return:
    '''
    while next_load_date < end_load_date:
        # Set dates required for loading new data
        next_load_date = datetime.strftime(next_load_date, '%Y%m%d')
        logging.info(f'Starting load for next load date: {next_load_date}')
        client = bigquery.Client(project='ga-mozilla-org-prod-001')
        load_dataset_id = dataset_id
        load_table_name = table_name
        load_table_suffix = next_load_date
        load_table_id = f'{load_table_name.lower()}_{load_table_suffix}'

        # Configure load job
        dataset_ref = client.dataset(load_dataset_id)
        table_ref = dataset_ref.table(load_table_id)
        load_job_config = bigquery.QueryJobConfig()  # query job config; results are written to the destination table below
        load_job_config.schema = [
            bigquery.SchemaField('date', 'DATE'),
            bigquery.SchemaField('country', 'STRING'),
            bigquery.SchemaField('source', 'STRING'),
            bigquery.SchemaField('medium', 'STRING'),
            bigquery.SchemaField('campaign', 'STRING'),
            bigquery.SchemaField('content', 'STRING'),
            bigquery.SchemaField('sessions', 'INTEGER'),
            bigquery.SchemaField('nonFXSessions', 'INTEGER'),
            bigquery.SchemaField('downloads', 'INTEGER'),
            bigquery.SchemaField('nonFXDownloads', 'INTEGER'),
            bigquery.SchemaField('countryCleaned', 'STRING')
        ]  # Define schema
        load_job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field='date',
        )
        load_job_config.write_disposition = 'WRITE_APPEND'  # Options are WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY
        load_job_config.destination = table_ref
        sql = f"""
        WITH parameters as(
        SELECT
          '{load_table_suffix}' as startDate,
          '{load_table_suffix}' as endDate
        ),
        
        sessionsTable as (
          SELECT
            PARSE_DATE('%Y%m%d', date) as date,
            CASE WHEN country IS NULL THEN '' ELSE country END as country,
            CASE WHEN source IS NULL THEN '' ELSE source END as source,
            CASE WHEN medium IS NULL THEN '' ELSE medium END as medium,
            CASE WHEN campaign IS NULL THEN '' ELSE campaign END as campaign,
            CASE WHEN content IS NULL THEN '' ELSE content END as content,
            SUM(sessions) as sessions,
            SUM(CASE WHEN browser != 'Firefox' THEN sessions ELSE 0 END) as nonFXSessions
            FROM(
              SELECT
                date,
                country,
                source,
                medium,
                campaign,
                content,
                browser,
                COUNT(DISTINCT visitIdentifier) as sessions
              FROM(
                  SELECT
                    date AS date,
                    CONCAT(CAST(fullVisitorId AS string),CAST(visitId AS string)) as visitIdentifier,
                    geoNetwork.country as country,
                    trafficSource.source as source,
                    trafficSource.medium as medium,
                    trafficSource.campaign as campaign,
                    trafficSource.adcontent as content,
                    device.browser as browser
                  FROM
                    `ga-mozilla-org-prod-001.65789850.ga_sessions_*`,
                    UNNEST (hits) AS hits
                  WHERE
                    _TABLE_SUFFIX >= (SELECT parameters.startDate from parameters)
                    AND _TABLE_SUFFIX <= (SELECT parameters.endDate from parameters)
                    AND totals.visits = 1
                  GROUP BY
                    date,visitIdentifier, country, source, medium, campaign, content, browser)
              GROUP BY
                date, country, source, medium, campaign, content,browser)
          GROUP BY date, country, source, medium, campaign, content),
        
        
        downloadsTable as(
        SELECT
          PARSE_DATE('%Y%m%d', date) as date,
          CASE WHEN country IS NULL THEN '' ELSE country END as country,
          CASE WHEN source IS NULL THEN '' ELSE source END as source,
          CASE WHEN medium IS NULL THEN '' ELSE medium END as medium,
          CASE WHEN campaign IS NULL THEN '' ELSE campaign END as campaign,
          CASE WHEN content IS NULL THEN '' ELSE content END as content,
          SUM(IF(downloads > 0,1,0)) as downloads,
          SUM(IF(downloads > 0 AND browser != 'Firefox',1,0)) as nonFXDownloads
        FROM (SELECT
          date AS date,
          fullVisitorId as visitorId,
          visitNumber as visitNumber,
          geoNetwork.country as country,
          trafficSource.source as source,
          trafficSource.medium as medium,
          trafficSource.campaign as campaign,
          trafficSource.adcontent as content,
          device.browser as browser,
          SUM(IF (hits.eventInfo.eventAction = "Firefox Download",1,0)) as downloads
        FROM
          `ga-mozilla-org-prod-001.65789850.ga_sessions_*`,
          UNNEST (hits) AS hits
        WHERE
          _TABLE_SUFFIX >= (SELECT parameters.startDate from parameters)
          AND _TABLE_SUFFIX <= (SELECT parameters.endDate from parameters)
          AND hits.type = 'EVENT'
          AND hits.eventInfo.eventCategory IS NOT NULL
          AND hits.eventInfo.eventLabel LIKE "Firefox for Desktop%"
        GROUP BY
          date,visitorID, visitNumber, country, source, medium, campaign, content, browser)
        GROUP BY date, country, source, medium, campaign, content
        ),
        
        siteData as (
        SELECT
          sessionsTable.date,
          sessionsTable.country,
          sessionsTable.source,
          sessionsTable.medium,
          sessionsTable.campaign,
          sessionsTable.content,
          SUM(sessionsTable.sessions) as sessions,
          SUM(sessionsTable.nonFXSessions) as nonFXSessions,
          SUM(downloadsTable.downloads) as downloads,
          SUM(downloadsTable.nonFXDownloads) as nonFXDownloads
        FROM
          sessionsTable
        LEFT JOIN
          downloadsTable
        ON
          sessionsTable.date = downloadsTable.date
          AND sessionsTable.country = downloadsTable.country
          AND sessionsTable.source = downloadsTable.source
          AND sessionsTable.medium = downloadsTable.medium
          AND sessionsTable.campaign = downloadsTable.campaign
          AND sessionsTable.content = downloadsTable.content
        GROUP BY
          date, country, source, medium, campaign, content
        ),
        
        cleanedSiteData as (
        SELECT
          siteData.*,
          standardizedCountryList.standardizedCountry as countryCleaned
        FROM
          siteData
        LEFT JOIN
          `lookupTables.standardizedCountryList` as standardizedCountryList
        ON
          siteData.country = standardizedCountryList.rawCountry
        )
        
        SELECT * from cleanedSiteData
        """

        # Run Load Job
        query_job = client.query(
            sql,
            # Location must match that of the dataset(s) referenced in the query
            # and of the destination table.
            location='US',
            job_config=load_job_config)  # API request - starts the query

        query_job.result()  # Waits for the query to finish
        logging.info(f'Query results loaded to table {table_ref.path}')

        # Set next_load_date
        next_load_date = datetime.strptime(next_load_date,
                                           '%Y%m%d') + timedelta(1)
    return
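A possible invocation of load_new_data is sketched below; the dataset and table names are placeholders, and the imports shown are the ones the function body itself relies on:

# Hedged usage sketch; the dataset and table names are placeholders.
import logging
from datetime import datetime

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    load_new_data(
        dataset_id='snippets',
        table_name='snippet_telemetry',
        next_load_date=datetime(2020, 1, 1),
        end_load_date=datetime(2020, 1, 8),
    )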
Example #9
 def __init__(self, project, sql, *args, **kwargs):
     super().__init__(project, sql)
     self.client = bigquery.Client(project=project)
Example #10
def bigquery_client():
    yield bigquery.Client()
Example #11
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="asset_a",  # this can be anything that identifies this data
    runtime_parameters={"query": "SELECT * from demo.taxi_data LIMIT 10"},
    batch_identifiers={"default_identifier_name": "default_identifier"},
)
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator: Validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
assert validator

# What is the name of the temp_table that was created as part of `get_validator`?
temp_table_name: str = validator.active_batch.data.selectable.description
client: bigquery.Client = bigquery.Client()
project: str = client.project
dataset_ref: bigquery.DatasetReference = bigquery.DatasetReference(
    project, bigquery_dataset
)
table_ref: bigquery.TableReference = dataset_ref.table(temp_table_name)
table: bigquery.Table = client.get_table(table_ref)

# what is its expiration
expected_expiration: datetime.datetime = datetime.datetime.now(
    datetime.timezone.utc
) + datetime.timedelta(days=1)
assert table.expires <= expected_expiration

# Ensure that passing in `bigquery_temp_table` will fire off a DeprecationWarning
batch_request: RuntimeBatchRequest = RuntimeBatchRequest(
Example #12
def client():
    return bigquery.Client()
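Zero-argument factories like this are typically wired up as pytest fixtures; a hedged sketch of that usage (the fixture and test names are invented):

# Hedged sketch: exposing the factory as a pytest fixture and consuming it in a test.
import pytest
from google.cloud import bigquery


@pytest.fixture(scope='session')
def bq_client():
    return bigquery.Client()


def test_client_has_project(bq_client):
    assert bq_client.project  # resolved from application default credentials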
Example #13
def subscriber():
    while True:
        counter = 10
        temp1 = []
        while (counter != -1):

            pubsub_client = pubsub.Client()
            topic_name = 'my-new-topic'
            topic = pubsub_client.topic(topic_name)
            subscription_name = 'sub1'
            subscription = topic.subscription(subscription_name)

            #subscription.create()

            #for subscription in topic.list_subscriptions():
            #	   print(subscription.name)

            #logging.debug('Subscription {} created on topic {}.'.format(
            #	subscription.name, topic.name))

            try:
                results = subscription.pull(return_immediately=True)
                if counter != 0:
                    #print counter
                    print('Received {} messages.'.format(len(results)))

                    for ack_id, message in results:
                        #print('* {}: {}, {}'.format(
                        #message.message_id, message.data, message.attributes))
                        temp1.append(str(message.data))
                    #var=var+1
                    #print results
                    #print counter
                    #print b1
                    #ts = self.request.get("ts")
                else:
                    i = 0
                    while (i != 10):
                        #print len(temp1)
                        #print i
                        #print temp1[i]
                        try:
                            b = json.loads(temp1[i])
                            #print('json load: {}'.format(b))
                        except Exception:
                            print('Could not parse the message payload as JSON')
                            print(i)

                        try:
                            #defing dataset variables
                            dataset_name = 'searce_poc_vuukle'
                            table_name = 'page_impressions'
                            today = b["timestamp"]
                            table_name = "%s$%s" % (table_name, today)

                            #print "hello1"
                            #putting data into bigquery
                            bigquery_client = bigquery.Client()
                            dataset = bigquery_client.dataset(dataset_name)
                            table = dataset.table(table_name)
                            data = b
                            #time_stamp1=time.strftime("%c")

                            #data['timestamp'] = time_stamp1

                            # Reload the table to get the schema.
                            table.reload()

                            ## get the names of schema
                            temp = list()
                            for key in data:
                                temp.append(data[key])
                                #rows = [data]
                                #print rows
                            errors = table.insert_data([temp])
                            if not errors:
                                logging.debug('Loaded 1 row into {}:{}'.format(
                                    dataset_name, table_name))
                            else:
                                logging.error(errors)
                        except Exception:
                            print("cannot load row into BigQuery")
                        i = i + 1

                    # # # Acknowledge received messages. If you do not acknowledge, Pub/Sub will
                    # # # redeliver the message.
                if results:
                    subscription.acknowledge(
                        [ack_id for ack_id, message in results])
                    #print( "one acknowledged")
            except:
                print("next")
                #continue
            counter = counter - 1
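This example targets the long-removed pubsub.Client interface. With the current google-cloud-pubsub library, the same pull-and-acknowledge flow looks roughly like the sketch below (project and subscription names are placeholders):

# Hedged sketch of the equivalent pull/ack flow on the modern google-cloud-pubsub API.
from google.cloud import pubsub_v1

subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path('your-project-id', 'sub1')

response = subscriber.pull(
    request={'subscription': subscription_path, 'max_messages': 10})
payloads = [msg.message.data for msg in response.received_messages]
if response.received_messages:
    subscriber.acknowledge(
        request={'subscription': subscription_path,
                 'ack_ids': [msg.ack_id for msg in response.received_messages]})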
Example #14
 def setUpClass(cls):
     cls._bq_cli = bigquery.Client()
Example #15
    def __init__(self,
                 bq_schema_filename=None,
                 input_bq_table=None,
                 hist_bq_table=None,
                 p_null=0.1,
                 n_keys=sys.maxsize,
                 min_date='2000-01-01',
                 max_date=datetime.date.today().strftime('%Y-%m-%d'),
                 only_pos=True,
                 max_int=10**11,
                 max_float=float(10**11),
                 float_precision=2,
                 write_disp='WRITE_APPEND',
                 key_skew='None',
                 primary_key_cols=None,
                 dest_joining_key_col=None):
        """
        Args:
        bq_schema_filename (str): A path to a local or gcs file containing a
            BigQuery schema in a json file.
        p_null (float): The desired sparsity of the generated data.
        n_keys (int): The cardinality of foreign key columns (for generating
            joinable schemas).
        min_date (datetime.date): The earliest date to generate.
        max_date (datetime.date): The latest date to generate.
        only_pos (bool): Specifies whether to allow negative numbers to be
            generated.
        max_int (int): The upper bound for the range of integers
            to generate.
        max_float (float): The upper bound for the range of floats to
            generate.
        float_precision (int): The desired display precision for generated
            floats. (Note that BigQuery will cast all floats with double
            precision on the backend).
        primary_key_cols (str): The primary key for the generated data.
        dest_joining_key_col (str): The name of the key column in the table
            we are generating that joins to source_joining_key_col.
        """
        bq_cli = bq.Client()
        if bq_schema_filename is not None:
            try:
                # Handles json from google cloud storage or local.
                if bq_schema_filename.find('gs://') == 0:
                    bkt, path = bq_schema_filename.replace('gs://',
                                                           '').split('/', 1)
                    client = gcs.Client()
                    bucket = client.get_bucket(bkt)

                    blob = bucket.get_blob(path)
                    self.schema = json.loads(blob.download_as_string())
                else:
                    with open(bq_schema_filename, 'r') as json_file:
                        self.schema = json.load(json_file)
            except ValueError as err:
                logging.error("Not a valid json file! \n %s", str(err))
            except AttributeError:
                logging.error("Could not find gcs file %s",
                              str(bq_schema_filename))
        elif input_bq_table:

            dataset_name, table_name = input_bq_table.split('.')
            bq_dataset = bq_cli.dataset(dataset_name)
            # This forms a TableReference object.
            bq_table_ref = bq_dataset.table(table_name)
            # Errors out if table doesn't exist.
            bq_table = bq_cli.get_table(bq_table_ref)

            # Quickly parse TableSchema object to list of dictionaries.
            self.schema = [{
                u'name': field.name,
                u'type': field.field_type,
                u'mode': field.mode
            } for field in bq_table.schema]
        if hist_bq_table:
            dataset_name, table_name = hist_bq_table.split('.')
            bq_dataset = bq_cli.dataset(dataset_name)
            # This forms a TableReference object.
            bq_table_ref = bq_dataset.table(table_name)
            # Errors out if table doesn't exist.
            bq_table = bq_cli.get_table(bq_table_ref)

            self.hist_bq_table = hist_bq_table
        else:
            self.hist_bq_table = None

        self.null_prob = float(p_null)
        self.n_keys = int(n_keys)
        self.min_date = datetime.datetime.strptime(min_date, "%Y-%m-%d").date()
        self.max_date = datetime.datetime.strptime(max_date, "%Y-%m-%d").date()
        self.only_pos = bool(only_pos)
        self.max_int = int(max_int)
        self.min_int = 0 if self.only_pos else -1 * self.max_int
        self.max_float = float(max_float)
        self.min_float = 0.0 if self.only_pos else -1.0 * self.max_float
        self.float_precision = int(float_precision)
        self.key_skew = key_skew
        self.dest_joining_key_col = dest_joining_key_col
        # Map the passed string representation of the desired disposition.
        # This will force early error if invalid write disposition.
        write_disp_map = {
            'WRITE_APPEND': beam.io.BigQueryDisposition.WRITE_APPEND,
            'WRITE_EMPTY': beam.io.BigQueryDisposition.WRITE_EMPTY,
            'WRITE_TRUNCATE': beam.io.BigQueryDisposition.WRITE_TRUNCATE
        }

        self.write_disp = write_disp_map[write_disp]
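The parsed schema (a list of {'name', 'type', 'mode'} dicts) and the mapped write disposition are the pieces a Beam BigQuery sink needs; a hedged sketch of how they might be handed to WriteToBigQuery (the table spec is a placeholder and `generator` stands for an instance of the class above):

# Hedged sketch: feeding the parsed schema and write disposition to a Beam sink.
import apache_beam as beam


def write_rows(pcoll, generator):
    return pcoll | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
        'your-project:your_dataset.your_table',
        schema={'fields': generator.schema},
        write_disposition=generator.write_disp,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)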
Example #16
    Returns:
        str: The environment variable value with single quotes removed, or None if unset.
    """
    original_string = os.environ.get(param,None)
    if original_string is not None:
        original_string = original_string.replace("'", "")

    return original_string

src_bucket = get_enviroment_variable('SOURCE_BUCKET_ID')
dataset_id = get_enviroment_variable('BQ_SETTINGS_DATASET_ID')
table_id='project_billing_info'
storage_client = storage.Client()
bucket = storage_client.get_bucket(src_bucket)
client = bq.Client()
dataset = client.dataset(dataset_id)
client.delete_table('cudinator_stg.project_billing_info', not_found_ok=True)  
for file in storage_client.list_blobs(bucket):
    if ('/mapping.json' in file.name):
        blob = bucket.get_blob(file.name)
        data = blob.download_as_string()
        dict_str = data.decode("UTF-8")
        data = json.loads(dict_str)  # parse the JSON instead of eval(); assumes `import json` above

        project_details = []
        billing_accounts = data.get("billing_accounts")
        # ETL work: attach the billing account number to each project
        for billingAccounts in billing_accounts:
            billing_Accounts_Projects=billingAccounts.get("projects")
            name=billingAccounts.get("name")
Example #17
# Standalone script to depict BigQuery features
from __future__ import absolute_import
from google.cloud import bigquery

# Constants
PROJECT      = "famous-store-237108"
DATASET_UDM  = "UDM"
DATASET_CONS = "Consumption"
TABLE = "country_dim"

client = bigquery.Client()  # Instantiate the BigQuery client

query = ("SELECT column_name, data_type, is_nullable FROM `" + PROJECT + "." + DATASET_CONS + ".INFORMATION_SCHEMA.COLUMNS` WHERE table_name = '" + TABLE + "' LIMIT 1000")
query_job = client.query(query)

rows = query_job.result()

schema = {}
col_list = []

for row in rows:
    col_info = {}

    col_info['name'] = row.column_name
    col_info['type'] = row.data_type
    # INFORMATION_SCHEMA.COLUMNS.is_nullable is the string 'YES' or 'NO'
    if row.is_nullable == 'YES':
        col_info['mode'] = 'NULLABLE'
    else:
        col_info['mode'] = 'REQUIRED'

    col_list.append(col_info)
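The loop stops at building col_list; a hedged continuation that turns it into bigquery.SchemaField objects (this is an assumption about the script's intent):

# Hedged continuation: convert the collected column info into SchemaField objects.
schema['fields'] = col_list
bq_schema = [
    bigquery.SchemaField(col['name'], col['type'], mode=col['mode'])
    for col in col_list
]
print(bq_schema)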
Example #18
 def __init__(self, project_name=None, credentials=None):
     if credentials is None:
         self.bq_client = bigquery.Client()
Example #19
    def runtest(self):
        """Run."""
        query_name = self.fspath.dirpath().basename
        query = read(f"{self.fspath.dirname.replace('tests', 'sql')}/query.sql")
        expect = load(self.fspath.strpath, "expect")

        tables: Dict[str, Table] = {}
        views: Dict[str, str] = {}

        # generate tables for files with a supported table extension
        for resource in next(os.walk(self.fspath))[2]:
            if "." not in resource:
                continue  # tables require an extension
            table_name, extension = resource.rsplit(".", 1)
            if table_name.endswith(".schema") or table_name in (
                "expect",
                "query_params",
            ):
                continue  # not a table
            if extension in TABLE_EXTENSIONS or extension in ("yaml", "json"):
                if extension in TABLE_EXTENSIONS:
                    source_format = TABLE_EXTENSIONS[extension]
                    source_path = os.path.join(self.fspath.strpath, resource)
                else:
                    source_format = TABLE_EXTENSIONS["ndjson"]
                    source_path = (self.fspath.strpath, table_name)
                if "." in table_name:
                    # combine project and dataset name with table name
                    original, table_name = (
                        table_name,
                        table_name.replace(".", "_").replace("-", "_"),
                    )
                    query = query.replace(original, table_name)
                tables[table_name] = Table(table_name, source_format, source_path)
            elif extension == "sql":
                if "." in table_name:
                    # combine project and dataset name with table name
                    original, table_name = (
                        table_name,
                        table_name.replace(".", "_").replace("-", "_"),
                    )
                    query = query.replace(original, table_name)
                views[table_name] = read(self.fspath.strpath, resource)

        # rewrite all udfs as temporary
        temp_udfs = parse_udf.sub_persisent_udfs_as_temp(query)
        if temp_udfs != query:
            query = temp_udfs
            # prepend udf definitions
            query = parse_udf.prepend_udf_usage_definitions(query)

        dataset_id = "_".join(self.fspath.strpath.split(os.path.sep)[-3:])
        if "CIRCLE_BUILD_NUM" in os.environ:
            dataset_id += f"_{os.environ['CIRCLE_BUILD_NUM']}"

        bq = bigquery.Client()
        with dataset(bq, dataset_id) as default_dataset:
            load_tables(bq, default_dataset, tables.values())
            load_views(bq, default_dataset, views)

            # configure job
            res_table = bigquery.TableReference(default_dataset, query_name)

            job_config = bigquery.QueryJobConfig(
                default_dataset=default_dataset,
                destination=res_table,
                query_parameters=get_query_params(self.fspath.strpath),
                use_legacy_sql=False,
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            )

            # run query
            job = bq.query(query, job_config=job_config)
            result = list(coerce_result(*job.result()))
            result.sort(key=lambda row: json.dumps(row, sort_keys=True))
            expect.sort(key=lambda row: json.dumps(row, sort_keys=True))

            print_and_test(expect, result)
Example #20
 def setUpClass(cls):
     cls.GCP_BIGQUERY_CLIENT = bigquery.Client(GCP_PROJECT_ID,
                                               location="EU")
     cls.DATASET = cls.__dataset()
     cls.__create_dataset_if_not_exists(cls.DATASET)
Example #21
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]):
        """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor.  custom_config.bigquery_serving_args is
        consumed by this class.  For the full set of parameters supported by
        Big Query ML, refer to https://cloud.google.com/bigquery-ml/

    Returns:
      None
    Raises:
      ValueError:
        If bigquery_serving_args is not in exec_properties.custom_config.
        If pipeline_root is not 'gs://...'
      RuntimeError: if the Big Query job failed.
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        model_push = artifact_utils.get_single_instance(
            output_dict['model_push'])
        if not self.CheckBlessing(input_dict):
            model_push.set_int_custom_property('pushed', 0)
            return

        model_export = artifact_utils.get_single_instance(
            input_dict['model_export'])
        model_export_uri = model_export.uri

        custom_config = exec_properties.get('custom_config', {})
        bigquery_serving_args = custom_config.get('bigquery_serving_args',
                                                  None)
        # if configuration is missing error out
        if bigquery_serving_args is None:
            raise ValueError('Big Query ML configuration was not provided')

        bq_model_uri = '`{}`.`{}`.`{}`'.format(
            bigquery_serving_args['project_id'],
            bigquery_serving_args['bq_dataset_id'],
            bigquery_serving_args['model_name'])

        # Deploy the model.
        model_path = path_utils.serving_model_path(model_export_uri)

        if not model_path.startswith('gs://'):
            raise ValueError(
                'pipeline_root must be gs:// for BigQuery ML Pusher.')

        absl.logging.info(
            'Deploying the model to BigQuery ML for serving: {} from {}'.
            format(bigquery_serving_args, model_path))

        query = ("""
      CREATE OR REPLACE MODEL {}
      OPTIONS (model_type='tensorflow',
               model_path='{}')""".format(bq_model_uri,
                                          os.path.join(model_path, '*')))

        # TODO(zhitaoli): Refactor the executor_class_path creation into a common
        # utility function.
        executor_class_path = '%s.%s' % (self.__class__.__module__,
                                         self.__class__.__name__)
        with telemetry_utils.scoped_labels(
            {telemetry_utils.TFX_EXECUTOR: executor_class_path}):
            default_query_job_config = bigquery.job.QueryJobConfig(
                labels=telemetry_utils.get_labels_dict())
        client = bigquery.Client(
            default_query_job_config=default_query_job_config)

        try:
            query_job = client.query(query)
            query_job.result()  # Waits for the query to finish
        except Exception as e:
            raise RuntimeError('BigQuery ML Push failed: {}'.format(e))

        absl.logging.info(
            'Successfully deployed model {} serving from {}'.format(
                bq_model_uri, model_path))

        # Setting the push_destination to bigquery uri
        model_push.set_int_custom_property('pushed', 1)
        model_push.set_string_custom_property('pushed_model', bq_model_uri)
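For reference, a hedged example of the custom_config this executor expects, built from the keys read in the code above (all values are placeholders):

# Hedged example of the custom_config consumed by Do(); every value is a placeholder.
custom_config = {
    'bigquery_serving_args': {
        'project_id': 'your-gcp-project',
        'bq_dataset_id': 'serving_models',
        'model_name': 'my_tf_model',
    }
}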
Example #22
        def enrich_task(ds, **kwargs):
            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = bigquery.Client()

            # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
            # when writeDisposition is WRITE_TRUNCATE

            # Create a temporary table
            temp_table_name = '{task}_{milliseconds}'.format(
                task=task, milliseconds=int(round(time.time() * 1000)))
            temp_table_ref = client.dataset(dataset_name_temp).table(
                temp_table_name)

            schema_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/schemas/{task}.json'.format(
                    task=task))
            schema = read_bigquery_schema_from_file(schema_path)
            table = bigquery.Table(temp_table_ref, schema=schema)

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            if time_partitioning_field is not None:
                table.time_partitioning = TimePartitioning(
                    field=time_partitioning_field)
            logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
            table = client.create_table(table)
            assert table.table_id == temp_table_name

            # Query from raw to temporary table
            query_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
            query_job_config.destination = temp_table_ref

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            print('Enrichment sql:')
            print(sql)

            query_job = client.query(sql,
                                     location='US',
                                     job_config=query_job_config)
            submit_bigquery_job(query_job, query_job_config)
            assert query_job.state == 'DONE'

            all_destination_projects = [(destination_dataset_project_id,
                                         dataset_name)]
            if copy_dataset_project_id is not None and len(copy_dataset_project_id) > 0 \
                    and copy_dataset_name is not None and len(copy_dataset_name) > 0:
                all_destination_projects.append(
                    (copy_dataset_project_id, copy_dataset_name))

            if load_all_partitions:
                for dest_project, dest_dataset_name in all_destination_projects:
                    # Copy temporary table to destination
                    copy_job_config = bigquery.CopyJobConfig()
                    copy_job_config.write_disposition = 'WRITE_TRUNCATE'
                    dest_table_name = '{task}'.format(task=task)
                    dest_table_ref = client.dataset(
                        dest_dataset_name,
                        project=dest_project).table(dest_table_name)
                    copy_job = client.copy_table(temp_table_ref,
                                                 dest_table_ref,
                                                 location='US',
                                                 job_config=copy_job_config)
                    submit_bigquery_job(copy_job, copy_job_config)
                    assert copy_job.state == 'DONE'
            else:
                for dest_project, dest_dataset_name in all_destination_projects:
                    # Merge
                    # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
                    merge_job_config = bigquery.QueryJobConfig()
                    # Finishes faster, query limit for concurrent interactive queries is 50
                    merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

                    merge_sql_path = os.path.join(
                        dags_folder,
                        'resources/stages/enrich/sqls/merge/merge_{task}.sql'.
                        format(task=task))
                    merge_sql_template = read_file(merge_sql_path)

                    merge_template_context = template_context.copy()
                    merge_template_context['params'][
                        'source_table'] = temp_table_name
                    merge_template_context['params'][
                        'destination_dataset_project_id'] = dest_project
                    merge_template_context['params'][
                        'destination_dataset_name'] = dest_dataset_name
                    merge_sql = kwargs['task'].render_template(
                        '', merge_sql_template, merge_template_context)
                    print('Merge sql:')
                    print(merge_sql)
                    merge_job = client.query(merge_sql,
                                             location='US',
                                             job_config=merge_job_config)
                    submit_bigquery_job(merge_job, merge_job_config)
                    assert merge_job.state == 'DONE'

            # Delete temp table
            client.delete_table(temp_table_ref)
Example #23
from google.cloud import bigquery
import pandas as pd
import queue  # Python 3 name of the stdlib Queue module
import threading
import datetime
import logging


# Set project Variables
projectid = "pythonproject-190701"
dataset_id = 'cmcdataset'
bqc = bigquery.Client(project=projectid)
job_config = bigquery.QueryJobConfig()
job_config.use_legacy_sql = False
dataset_ref = bqc.dataset(dataset_id)

def get_time_str():
    timestring = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    return timestring

def query(query,bqc,job_config):
    query_job = bqc.query(query,job_config=job_config)
    return query_job

def query_job_2df(query_job):
    # Materialize the query results as a list of Row objects
    data = list(query_job.result())
    # Build a DataFrame, ordering the columns by their field index in the result schema
    df = pd.DataFrame.from_records(
        data,
        columns=sorted(data[0]._xxx_field_to_index,
                       key=data[0]._xxx_field_to_index.__getitem__))
    return df
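Recent versions of google-cloud-bigquery can build the DataFrame directly; a hedged alternative to query_job_2df (requires the pandas extra, e.g. pip install "google-cloud-bigquery[pandas]"):

# Hedged alternative: let the client library assemble the DataFrame itself.
def query_job_2df_v2(query_job):
    return query_job.result().to_dataframe()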
Example #24
    def get_gbq_data(self, sim_config, local_cache_file):
        if not self.gbq_table:
            raise ValueError(
                f"gbq_table={self.gbq_table} is unset. " +
                "Set env variable for specific source, e.g. FLATFILES_GBQ_TABLE"
            )

        if not sim_config["identifier"]:
            raise ValueError(
                f"Invalid sim_config: sim_config[identifier]={sim_config['identifier']}"
            )

        # first get the table schema from GBQ and take the intersection of our
        # data spec columns to see which columns we can get from GBQ
        client = bigquery.Client(project=self.gcp_project)
        table = client.get_table(self.gbq_table)
        schema_columns = [_s.name for _s in table.schema]
        # take intersection preserving order in data spec
        gbq_columns = [
            _col for _col in self.data_spec.full.columns
            if _col in schema_columns
        ]

        # query will get data for entire data set per ID so that the cache can
        # be built up correctly.
        columns_str = ", ".join(gbq_columns)
        query_str = f"SELECT {columns_str}\n"
        query_str += f"FROM `{self.gbq_table}`\n"
        query_str += f"WHERE Identifier = '{sim_config['identifier']}'\n"

        # we will use pandas-gbq to read data to df
        # https://pandas-gbq.readthedocs.io/en/latest/
        # use_bqstorage_api=False
        # BigQuery Storage API allows downloading large (>125 MB) query results
        # more quickly at an increased cost. We are querying once per ID, which
        # should be no more than 50 MB.
        # max_results=1,000,000
        # 8760 * 12 = 105,120 records per thermostat-year
        # use max_results as a guard against the query accidentally returning
        # multiple datasets; there should never be 1,000,000 results for a single identifier

        _df = pandas_gbq.read_gbq(
            query_or_table=query_str,
            project_id=self.gcp_project,
            credentials=self.gbq_token,
            col_order=gbq_columns,
            reauth=True,
            dialect="standard",
            max_results=1000000,
            use_bqstorage_api=False,
            dtypes={
                k: v["dtype"]
                for k, v in self.data_spec.full.spec.items()
                if k in gbq_columns
            },
        )

        if _df.empty:
            logger.error((
                f"Identifier: {sim_config['identifier']}",
                f" not found in GBQ table: {self.gbq_table}",
            ))

        if local_cache_file:
            if os.path.isdir(os.path.dirname(local_cache_file)):
                # store as gzip compressed parquet file
                _df.to_parquet(local_cache_file,
                               compression="gzip",
                               index=False)
        else:
            logger.info(
                "GCSDataSource received no local_cache. Proceeding without caching."
            )

        return _df
Example #25
import numpy as np
from google.cloud import bigquery
from scipy.integrate import odeint
from scipy.optimize import curve_fit


def main():
    client = bigquery.Client()
    query = """
        SELECT 
            country_name 
        FROM 
            bigquery-public-data.covid19_open_data.covid19_open_data 
        GROUP BY 
            country_name 
        ORDER BY 
            country_name
    """

    country = []
    query_job = client.query(query)
    for row in query_job:
        country.append(row["country_name"])

    query = """
        SELECT
            population,
            AVG((population - cumulative_confirmed - cumulative_recovered - cumulative_deceased)) as S,
            AVG(cumulative_confirmed) as I,
            AVG(cumulative_recovered) as R
        FROM 
            `bigdata-301014.Domographics.population` 
        INNER JOIN 
            `bigquery-public-data.covid19_open_data.covid19_open_data`
        ON 
            location_key = key
        WHERE 
            LENGTH(location_key) = 2 
            AND country_name = '{country}'
        GROUP BY 
            population
        LIMIT 10
    """.format(country=country[1])

    query_job = client.query(query)

    n = 0
    S0 = 0
    I0 = 0
    R0 = 0
    for row in query_job:
        n = row["population"]
        S0 = row["S"]
        I0 = 1
        R0 = 0

    def sir_model(y, x, beta, gamma):
        sus = -beta * y[0] * y[1] / n
        rec = gamma * y[1]
        inf = -(sus + rec)
        return sus, inf, rec

    def fit_odeint(x, beta, gamma):
        print(" beta: ", beta, " gamma: ", gamma)
        return odeint(sir_model, (S0, I0, R0), x, args=(beta, gamma))[:,1]

    query = """
        SELECT
            date,
            IFNULL(cumulative_confirmed, 0) as I
        FROM 
            bigquery-public-data.covid19_open_data.covid19_open_data 
        WHERE 
            LENGTH(location_key) = 2 
            AND country_name = '{country}' 
    """.format(country=country[1])

    query_job = client.query(query)

    xdata = []
    ydata = []
    i = 1
    for row in query_job:
        ydata.append(float(row["I"]))
        #xdata.append(row["date"])
        xdata.append(i)
        i = i + 1

    xdata = np.array(xdata)
    ydata = np.array(ydata)

    print(xdata)
    print(ydata)
    popt, pcov = curve_fit(fit_odeint, xdata, ydata, maxfev=1000000000)

    print(np.round(np.sqrt(pcov[0][0]),3))
    print(np.round(np.sqrt(pcov[1][1]),3))
Example #26
    parser.add_argument(
        '--speedFactor',
        help=
        'Example: 60 implies 1 hour of data sent to Cloud Pub/Sub in 1 minute',
        required=True,
        type=float)
    parser.add_argument(
        '--jitter',
        help='type of jitter to add: None, uniform, exp  are the three options',
        default='None')

    # set up BigQuery bqclient
    logging.basicConfig(format='%(levelname)s: %(message)s',
                        level=logging.INFO)
    args = parser.parse_args()
    bqclient = bq.Client(args.project)
    dataset = bqclient.get_dataset(
        bqclient.dataset('flights'))  # throws exception on failure

    # jitter?
    if args.jitter == 'exp':
        jitter = 'CAST (-LN(RAND()*0.99 + 0.01)*30 + 90.5 AS INT64)'
    elif args.jitter == 'uniform':
        jitter = 'CAST(90.5 + RAND()*30 AS INT64)'
    else:
        jitter = '0'

    # run the query to pull simulated events
    querystr = """\
SELECT
  EVENT,
Example #27
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from GCP storage bucket.

        If we have a stored etag and it matches the current GCP blob, we can
        safely skip download since the blob/file content must not have changed.

        Args:
            key (str): name of the blob in the GCP storage bucket
            stored_etag (str): optional etag stored in our DB for comparison

        Returns:
            tuple(str, str) with the local filesystem path to file and GCP's etag.

        """
        try:
            if start_date:
                invoice_month = start_date.strftime("%Y%m")
                query = f"""
                SELECT {",".join(self.gcp_big_query_columns)}
                FROM {self.table_name}
                WHERE DATE(_PARTITIONTIME) >= '{self.query_date}'
                AND invoice.month = '{invoice_month}'
                """
                LOG.info(f"Querying for invoice_month ({invoice_month})")
            else:
                query = f"""
                SELECT {",".join(self.gcp_big_query_columns)}
                FROM {self.table_name}
                WHERE DATE(_PARTITIONTIME) >= '{self.query_date}'
                """
            client = bigquery.Client()
            query_job = client.query(query)
        except GoogleCloudError as err:
            err_msg = (
                "Could not query table for billing information."
                f"\n  Provider: {self._provider_uuid}"
                f"\n  Customer: {self.customer_name}"
                f"\n  Response: {err.message}"
            )
            LOG.warning(err_msg)
            raise GCPReportDownloaderError(err_msg)
        directory_path = self._get_local_directory_path()
        full_local_path = self._get_local_file_path(directory_path, key)
        os.makedirs(directory_path, exist_ok=True)
        msg = f"Downloading {key} to {full_local_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        try:
            with open(full_local_path, "w") as f:
                writer = csv.writer(f)
                writer.writerow(self.gcp_big_query_columns)
                for row in query_job:
                    writer.writerow(row)
        except (OSError, IOError) as exc:
            err_msg = (
                "Could not create GCP billing data csv file."
                f"\n  Provider: {self._provider_uuid}"
                f"\n  Customer: {self.customer_name}"
                f"\n  Response: {exc}"
            )
            raise GCPReportDownloaderError(err_msg)

        msg = f"Returning full_file_path: {full_local_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        dh = DateHelper()
        return full_local_path, self.etag, dh.today
Example #28
from google.api_core.client_options import ClientOptions
from google.cloud import bigquery
from google.cloud import storage
from googleapiclient import discovery

# Basic data about your project
project_id = "your-project-id"
bucket_name = "your-bucket-name"
bucket_folder = 'your-bucket-folder-name'

# Endpoint of the ML model deployed on AI Platform
endpoint = 'your-ml-endpoint'
client_options = ClientOptions(api_endpoint=endpoint)
ml = discovery.build('ml-name', 'ml-version', client_options=client_options)
model = 'your-model-name'
version = 'your-model-version'
model_name = 'projects/{}/models/{}/versions/{}'.format(
    project_id, model, version)

# BigQuery
bq_client = bigquery.Client(project=project_id)
table_id = "project-id.dataset-name.table-name"

# Cloud Storage
storage_client = storage.Client()


# Configuration for Audio Data
class conf:
    sr = 16000
    duration = 3
    hop_length = 340 * duration
    fmin = 20
    fmax = sr // 2
    n_mels = 128
    n_fft = n_mels * 20
Example #29
import flask
from datetime import datetime, date, timedelta
from google.cloud import bigquery
import concurrent.futures

try:
    import googleclouddebugger
    googleclouddebugger.enable()
except ImportError:
    pass

app = flask.Flask(__name__)
app.debug = True
bigquery_client = bigquery.Client()

app.config.update(SECRET_KEY='x123124asdflsadlf1!!',
                  SESSION_COOKIE_NAME='finedatalab_flask_session',
                  PERMANENT_SESSION_LIFETIME=timedelta(31))


@app.after_request
def set_response_headers(r):
    r.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
    r.headers['Pragma'] = 'no-cache'
    r.headers['Expires'] = '0'
    return r


@app.route("/")
def main():
    return flask.render_template("index.html",
Example #30
def Ejecutar():

    storage_client = storage.Client()
    bucket = storage_client.get_bucket('ct-telefonia')
    gcs_path = 'gs://ct-telefonia'
    sub_path = KEY_REPORT + '/'
    output = gcs_path + "/" + sub_path + fecha + ext
    blob = bucket.blob(sub_path + fecha + ext)
    dateini = request.args.get('dateini')
    dateend = request.args.get('dateend')

    if dateini is None:
        dateini = GetDate1
    else:
        dateini = dateini + hour1

    if dateend is None:
        dateend = GetDate2
    else:
        dateend = dateend + hour2

    client = bigquery.Client()
    QUERY = (
        'SELECT servidor, operacion, token, ipdial_code, id_cliente, cartera FROM telefonia.parametros_ipdial where Estado = "Activado" order by ipdial_code asc'
    )  #WHERE ipdial_code = "intcob-unisabaneta"
    query_job = client.query(QUERY)
    rows = query_job.result()
    data = ""

    try:
        os.remove(ruta_completa)  # Delete the local file (aries)
    except OSError:
        print("Nothing to delete locally (aries)")

    try:
        blob.delete()  # Delete the previous blob from Cloud Storage
    except Exception:
        print("Nothing to delete from storage")

    try:
        QUERY2 = (
            'DELETE FROM `contento-bi.telefonia.agent_status` WHERE date = ' +
            '"' + dateini[0:8] + '"')
        query_job = client.query(QUERY2)
        rows2 = query_job.result()
    except Exception:
        print("Nothing to delete from BigQuery")

    file = open(ruta_completa, "a")
    for row in rows:
        url = 'http://' + str(
            row.servidor
        ) + '/ipdialbox/api_reports.php?token=' + row.token + '&report=' + str(
            CODE_REPORT) + '&date_ini=' + dateini + '&date_end=' + dateend
        datos = requests.get(url).content

        # print(url)

        if len(datos) < 50:  # reuse the response already fetched above
            continue
        else:
            i = json.loads(datos)
            for rown in i:
                file.write(
                    str(rown["operation"]) + "|" +
                    str(rown["date"]) + "|" + str(rown["hour"]) + "|" +
                    str(rown["id_agent"]) + "|" +
                    str(rown["agent_identification"]) + "|" +
                    str(rown["agent_name"]) + "|" +
                    str(rown["CALLS"]) + "|" + str(rown["CALLS INBOUND"]) +
                    "|" + str(rown["CALLS OUTBOUND"]) + "|" +
                    str(rown["CALLS INTERNAL"]) + "|" +
                    str(rown["READY TIME"]) + "|" + str(rown["INBOUND TIME"]) +
                    "|" + str(rown["OUTBOUND TIME"]) + "|" +
                    str(rown["NOT-READY TIME"]) + "|" +
                    str(rown["RING TIME"]) + "|" + str(rown["LOGIN TIME"]) +
                    "|" + str(rown["AHT"]) + "|" +
                    str(rown["OCUPANCY"]) + "|" +
                    str(rown["AUX TIME"]) + "|" + str(row.id_cliente) + "|" +
                    str(row.cartera) + "\n")

    file.close()
    blob.upload_from_filename(ruta_completa)
    time.sleep(10)
    ejecutar = agent_status_beam.run(output, KEY_REPORT)
    time.sleep(60)

    return ("The " + KEY_REPORT + " process has just been run, updating from " +
            dateini + " to " + dateend)