def _diff_tables(work_item: DbDiffWorkItem) -> None:
    bigquery_client = Client(_http=authorized_session())
    table_name = work_item.table_name
    primary_key = f"{work_item.table_name}_id"
    except_clause = ",".join(work_item.except_fields)
    query = f"""
    WITH argo AS (
        SELECT * EXCEPT ({except_clause})
        FROM `broad-jade-dev-data.datarepo_{work_item.argo_dataset_name}.{table_name}`
    ),
    dagster AS (
        SELECT * EXCEPT ({except_clause})
        FROM `broad-jade-dev-data.datarepo_{work_item.dagster_dataset_name}.{table_name}`
    )
    SELECT *
    FROM argo
    FULL OUTER JOIN dagster ON argo.{primary_key} = dagster.{primary_key}
    WHERE to_json_string(argo) != to_json_string(dagster)
    """

    result = bigquery_client.query(query, None).result()
    rows = [row for row in result]
    if rows:
        print(f"❌ diff in {table_name}")
        for row in rows:
            print(row)
    else:
        print(f"✅ no diff in {table_name}")

def _make_client(project="test-project", connection=None):
    from google.cloud.bigquery.client import Client

    if connection is None:
        connection = _make_connection()

    client = Client(project=project, credentials=_make_credentials(), _http=object())
    client._connection = connection
    return client

def get_google_bigquery_api_client():
    """
    Return the Google BigQuery API client.

    Returns:
        google_bigquery_client: google.cloud.bigquery.client.Client instance.
    """
    service_account_credentials = getattr(
        settings, 'OPR_GOOGLE_SERVICE_ACCOUNT_CREDENTIALS', {})
    google_project_id = getattr(settings, 'OPR_GOOGLE_CLOUD_PROJECT_ID', '')

    if not (service_account_credentials or google_project_id):
        logger.error(
            'Google Service Account credentials or project ID were not provided.'
        )
        raise GoogleBigQueryInformationError(
            'Google Service Account credentials or project ID were not provided.'
        )

    credentials = service_account.Credentials.from_service_account_info(
        service_account_credentials,
        scopes=BIGQUERY_API_SCOPES,
    )

    return Client(
        project=google_project_id,
        credentials=credentials,
    )

def bqtk_client() -> Client:
    """BigQuery client as a fixture

    Returns:
        Client: BigQuery client
    """
    return Client(location="EU")

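# Hypothetical usage sketch (not from the original source): assuming bqtk_client
# above is registered as a pytest fixture (e.g. decorated with @pytest.fixture
# in conftest.py), a test receives the client by parameter name.
from google.cloud.bigquery.client import Client


def test_default_location(bqtk_client: Client):
    # Requires application-default credentials to be available in the environment.
    assert bqtk_client.location == "EU"
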
class Bigquery():
    """Base big query class."""

    def __init__(self, query_job_config=None):
        """Initialize big query object."""
        self.client = None
        self.job_query_obj = None
        self._configure_gcp_client(query_job_config)

    def _configure_gcp_client(self, query_job_config):
        """Configure GCP client."""
        logger.info('Storing BigQuery Auth Credentials')
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SETTINGS.bigquery_credentials_filepath

        logger.info('Creating new query job configuration')
        if query_job_config:
            self.query_job_config = query_job_config
        else:
            self.query_job_config = QueryJobConfig(use_legacy_sql=False, use_query_cache=True)

        self.client = Client(default_query_job_config=self.query_job_config)

    def run(self, query):
        """Run the bigquery synchronously."""
        if self.client and query:
            self.job_query_obj = self.client.query(query, job_config=self.query_job_config)
            return self.job_query_obj.job_id
        else:
            raise Exception('Client or query missing')

    def get_result(self):
        """Get the result of the job."""
        assert self.job_query_obj is not None, 'Job is not initialized'
        yield from self.job_query_obj.result()

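# Hypothetical usage sketch for the Bigquery wrapper above (the query text is a
# placeholder, not from the original source).
bq = Bigquery()
job_id = bq.run("SELECT 1 AS one")
print("Started BigQuery job %s" % job_id)
for row in bq.get_result():
    print(dict(row))
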
def get_gcp_daily_total_cost(client: Client) -> dict:
    """Gets cost data for the past day for all projects.

    Returns:
        Cost data, including currency and date (ISO 8601).
    """
    query = (f"""
        SELECT
            SUM(cost) as cost_sum,
            currency as currency,
            DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY) as date,
        FROM {SETTINGS.PROJECT_ID}.{SETTINGS.DATA_SET}.{SETTINGS.TABLE_NAME}
        WHERE CAST(DATE(_PARTITIONTIME) AS DATE) = DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
        GROUP BY currency
        LIMIT 1;
    """)

    query_job = client.query(query)
    rows_iter = query_job.result(max_results=1)
    rows = list(rows_iter)
    if rows:
        return {
            'cost_sum': round_cost_value(rows[0].cost_sum),
            'currency': rows[0].currency,
            'date': rows[0].date.isoformat()
        }

    one_month_ago = date.today().replace(day=1) - timedelta(days=1)
    return {  # Default, when there's no data.
        'cost_sum': 0.0,
        'currency': '',
        'date': f'{one_month_ago.year}-{one_month_ago.month}'
    }

def get_gcp_monthly_total_cost(client: Client) -> dict:
    """Gets cost data for the past month for all projects.

    Returns:
        Cost data, including currency and (month) date (ISO 8601).
    """
    query = (f"""
        SELECT
            SUM(cost) as cost_sum,
            currency as currency,
            TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), MONTH, 'UTC') as month
        FROM {SETTINGS.PROJECT_ID}.{SETTINGS.DATA_SET}.{SETTINGS.TABLE_NAME}
        WHERE _PARTITIONTIME BETWEEN
            TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), MONTH, 'UTC')
            AND TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), DAY, 'UTC')
        GROUP BY currency
        LIMIT 1;
    """)

    query_job = client.query(query)
    rows_iter = query_job.result(max_results=1)
    rows = list(rows_iter)
    return {
        'cost_sum': round_cost_value(rows[0].cost_sum),
        'currency': rows[0].currency,
        'date': rows[0].month.isoformat()
    }

def main():
    client = Client()
    for example in _find_examples():
        to_delete = []
        print('%-30s: %s' % (example.func_name, example.func_doc))
        try:
            example(client, to_delete)
        except AssertionError as e:
            print(' FAIL: %s' % (e, ))
        except Exception as e:  # pylint: disable=broad-except
            print(' ERROR: %r' % (e, ))
        for item in to_delete:
            item.delete()

def bigquery_client(
        db: typing.Union[str, mara_db.dbs.BigQueryDB]
) -> 'google.cloud.bigquery.client.Client':
    """Get a BigQuery client for a bq database alias"""
    from google.cloud.bigquery.client import Client

    if isinstance(db, str):
        db = mara_db.dbs.db(db)

    credentials = bigquery_credentials(db)
    return Client(project=credentials.project_id,
                  credentials=credentials,
                  location=db.location)

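# Hypothetical usage sketch (the alias "dwh" is a placeholder, not from the
# original source; assumes a mara_db BigQueryDB alias is configured).
client = bigquery_client('dwh')
for dataset in client.list_datasets():
    print(dataset.dataset_id)
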
def text_analytic():
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = (r"C:\Users\MAIH004\My Project-4f1de5feb39a.json")
    bq_client = Client()
    # result = GetSentimentScore()
    text_list = SearchResult.query.all()
    a = []
    sum = 0
    density = [0] * 10
    print(density)
    for i in text_list:
        record = {}
        sum += i.score
        if (i.score * 100) <= 10:
            density[0] += 1
        if ((i.score * 100) > 10) and ((i.score * 100) <= 20):
            density[1] += 1
        if ((i.score * 100) > 20) and ((i.score * 100) <= 30):
            density[2] += 1
        if ((i.score * 100) > 30) and ((i.score * 100) <= 40):
            density[3] += 1
        if ((i.score * 100) > 40) and ((i.score * 100) <= 50):
            density[4] += 1
        if ((i.score * 100) > 50) and ((i.score * 100) <= 60):
            density[5] += 1
        if ((i.score * 100) > 60) and ((i.score * 100) <= 70):
            density[6] += 1
        if ((i.score * 100) > 70) and ((i.score * 100) <= 80):
            density[7] += 1
        if ((i.score * 100) > 80) and ((i.score * 100) <= 90):
            density[8] += 1
        if ((i.score * 100) > 90) and ((i.score * 100) <= 100):
            density[9] += 1
        record['id'] = i.id
        record['source'] = i.sourceName
        record['title'] = i.title
        record['score'] = i.score
        record['content'] = i.content
        record['url'] = i.url
        record['description'] = i.description
        a.append(record)
    print(density)
    averageScore = round(float(sum / 20), 2)
    JSONResult = {}
    JSONResult['average'] = averageScore
    JSONResult['list'] = a
    JSONResult['chartData'] = density
    return json.dumps(JSONResult)

def get_cost_filter_project_daily_interval(client: Client, project_id: str, days_ago: int) -> dict:
    """Gets cost data for a specified number of days ago for a specified project.

    Args:
        project_id: The project ID of which to retrieve cost data
        days_ago: The number of days ago from today to retrieve cost data

    Returns:
        Cost data, including currency and date.
    """
    query = (f"""
        SELECT
            SUM(cost) as cost,
            currency as currency,
            TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {days_ago} DAY), DAY) as date
        FROM {SETTINGS.PROJECT_ID}.{SETTINGS.DATA_SET}.{SETTINGS.TABLE_NAME}
        WHERE _PARTITIONTIME BETWEEN
            TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {days_ago} DAY), DAY)
            AND TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {days_ago} DAY), DAY)
        AND project.id='{project_id}'
        GROUP BY currency
        LIMIT 1;
    """)

    query_job = client.query(query)
    rows_iter = query_job.result(max_results=1)
    rows = list(rows_iter)
    if rows:
        return {
            'cost': round_cost_value(rows[0].cost),
            'currency': rows[0].currency,
            'date': rows[0].date.isoformat()
        }

    return {  # Default, when there's no data.
        'cost': 0.0,
        'currency': '',
        'date': (date.today() - timedelta(days=days_ago)).isoformat()
    }

def get_project_ids_with_monthly_cost(client: Client) -> list:
    """Gets a list of all project IDs within the billing data from BigQuery.

    Does not include projects for which cost was 0 in the past month.

    Returns:
        Project ID(s)
    """
    query = (f"""
        SELECT DISTINCT project.id AS pid
        FROM {SETTINGS.PROJECT_ID}.{SETTINGS.DATA_SET}.{SETTINGS.TABLE_NAME}
        WHERE _PARTITIONTIME BETWEEN
            TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), MONTH, 'UTC')
            AND TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), DAY, 'UTC')
        AND project.id IS NOT NULL
        AND cost > 0;
    """)

    query_job = client.query(query)
    return [row.pid for row in query_job.result() if row.pid is not None]

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    from google.cloud import storage
    from google.cloud import bigquery
    from google.cloud.bigquery.client import Client

    creds = '/home/pi/developemnt/vive-gpio/googlecreds.json'
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = creds
    bq_client = Client()
    # client = bigquery.Client.from_service_account_json(os.environ['HOME'] +
    #     "/.google/googlecreds.json")

    print("Connecting to Google Cloud Storage")
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    print("Uploading file")
    blob.upload_from_filename(source_file_name)
    blob.make_public()

    print('File {} uploaded to {}.'.format(source_file_name, destination_blob_name))

def get_gcp_project_daily_top_services(
        client: Client,
        project_id: str,
        number: int = SETTINGS.NUMBER_OF_TOP_SERVICES_TO_INVESTIGATE) -> list:
    """Gets data on daily top `number` of the highest costing services on the given `project_id`.

    Returns:
        Data on top services and cost
    """
    query = (f"""
        SELECT
            SUM(cost) as cost,
            service.description as service_desc,
            currency as currency,
        FROM {SETTINGS.PROJECT_ID}.{SETTINGS.DATA_SET}.{SETTINGS.TABLE_NAME}
        WHERE _PARTITIONTIME BETWEEN
            TIMESTAMP_TRUNC(TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 24 HOUR), DAY)
            AND TIMESTAMP_TRUNC(CURRENT_TIMESTAMP(), DAY)
        AND project.id='{project_id}'
        GROUP BY service_desc, currency
        ORDER BY(cost) DESC
        LIMIT {number};
    """)

    query_job = client.query(query)
    rows_iter = query_job.result()

    top_services = []
    for row in rows_iter:
        top_services.append({
            'service_name': row['service_desc'],
            'cost': round_cost_value(row['cost']),
            'currency': row['currency']
        })
    return top_services

from __future__ import division

import re
import sys
import os

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import pyaudio
from six.moves import queue
from google.cloud.bigquery.client import Client
import time

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'../auth-goog.json'
bq_client = Client()

# Audio recording parameters
RATE = 24000
CHUNK = int(RATE / 10)  # 100ms


class MicrophoneStream(object):
    """Opens a recording stream as a generator yielding the audio chunks."""

    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):

class GetData:
    def __init__(self,
                 data_source=None,
                 date=None,
                 data_query_path=None,
                 time_indicator=None,
                 feature=None,
                 test=None):
        self.data_source = data_source if data_source is not None else 'csv'
        self.data_query_path = data_query_path
        self.time_indicator = time_indicator
        self.feature = feature
        self.conn = None
        self.data = pd.DataFrame()
        self.nrows = test
        self.date = date
        self.query = data_query_path

    def get_connection(self):
        config = conf('config')
        if self.data_source in ['postgresql', 'awsredshift', 'mysql']:
            server, db, user, pw, port = str(config['db_connection']['server']), \
                                         str(config['db_connection']['db']), \
                                         str(config['db_connection']['user']), \
                                         str(config['db_connection']['password']), \
                                         int(config['db_connection']['port'])
        if self.data_source == 'mysql':
            from mysql import connector
            self.conn = connector.connect(host=server, database=db, user=user, password=pw)
        if self.data_source in ['postgresql', 'awsredshift']:
            import psycopg2
            self.conn = psycopg2.connect(user=user, password=pw, host=server, port=port, database=db)
        if self.data_source == 'googlebigquery':
            from google.cloud.bigquery.client import Client
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = join(
                conf('data_main_path'), "", config['db_connection']['db'])
            self.conn = Client()
        print("db connection is done!")

    def convert_date(self):
        try:
            self.data[self.time_indicator] = self.data[self.time_indicator].apply(lambda x: parse(str(x)))
        except Exception as e:
            print(e)

    def convert_feature(self):
        try:
            self.data[self.feature] = self.data[self.feature].apply(lambda x: convert_feature(x))
        except Exception as e:
            print(e)

    def check_data_with_filtering(self):
        if self.data_source in ('json', 'yaml', 'csv'):
            self.query = lambda x: x.query(self.time_indicator + " > '" + str(self.date) + "'") \
                if self.date is not None else x
        if self.data_source in ['mysql', 'postgresql', 'awsredshift', 'googlebigquery']:
            if self.date:
                self.query = "SELECT * FROM (" + self.data_query_path + ") AS T WHERE T." + \
                             self.time_indicator + " >= '" + str(self.date) + "' "

    def query_data_source(self):
        self.check_data_with_filtering()
        # import data via pandas
        if self.data_source in ['mysql', 'postgresql', 'awsredshift']:
            self.get_connection()
            self.data = pd.read_sql(
                self.query + " LIMIT " + str(self.nrows) if self.nrows else self.query, self.conn)
        # import data via google
        if self.data_source == 'googlebigquery':
            self.get_connection()
            self.data = self.conn.query(
                self.query + " LIMIT " + str(self.nrows) if self.nrows else self.query).to_dataframe()
        # import via pandas
        if self.data_source == 'csv':
            try:
                for sep in [',', ';', ':']:
                    self.data = pd.read_csv(filepath_or_buffer=join(conf('data_main_path'), self.data_query_path),
                                            error_bad_lines=False,
                                            encoding="ISO-8859-1",
                                            sep=sep,
                                            nrows=self.nrows)
                    if len(self.data.columns) > 1:
                        break
            except Exception as e:
                print(e)
        if self.data_source == 'json':
            self.data = read_write_to_json(conf('directory'), self.data_query_path, None, is_writing=False)
        if self.data_source == 'yaml':
            self.data = read_yaml(conf('data_main_path'), self.data_query_path)
        if self.data_source in ('json', 'yaml', 'csv'):
            self.data = self.query(self.data)

    def data_execute(self):
        self.query_data_source()
        self.convert_date()
        self.convert_feature()

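# Hypothetical usage sketch for GetData above (all argument values are
# placeholders, not from the original source).
gd = GetData(data_source='csv',
             data_query_path='metrics.csv',
             time_indicator='event_date',
             feature='value',
             date='2020-01-01')
gd.data_execute()
print(gd.data.head())
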
import os
from google.cloud.bigquery.client import Client
import google.auth
from google.cloud.bigquery.schema import SchemaField
from google.cloud import bigquery
import datetime, time
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'connect_bigquery.json'
credentials, project = google.auth.default()
credentials = credentials.with_scopes(['https://www.googleapis.com/auth/bigquery'])

client = Client(credentials=credentials)
dataset = client.dataset('ahamove_archive')

sql = """
select
    accept_time,
    board_time,
    accept_distance as distance,
    json_extract_scalar(_extra_props,'$.accept_lat') as accept_lat,
    json_extract_scalar(_extra_props,'$.accept_lng') as accept_lng,
    json_extract_scalar(_extra_props,'$.pickup_lat') as pickup_lat,
    json_extract_scalar(_extra_props,'$.pickup_lng') as pickup_lng,
    date_diff(date(o.order_time), date(s.first_complete_time), day) as experience
from ahamove_archive.order_archive o
left join ahamove_archive.supplier s on o.supplier_id = s.id
where order_time >= '2019-03-01'
  and order_time <= '2019-03-02'
  and o.status = 'COMPLETED'

from google.cloud.bigquery.client import Client

from pointz import settings

client = Client.from_service_account_json(settings.BIGQUERY_SECRET_JSON)
dataset_pointz = client.dataset('pointz')

import os
import csv

from google.cloud import bigquery
from google.cloud.bigquery.client import Client

########### Libraries ###############
# import common functions
from configuration_functions import *
####################################################################

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = get_google_credentials()
client = Client()
hive_conn = hive_connect()


def get_name_database_bigquery():
    return "hive"


def get_name_table_bigquery():
    return "email_hadoop"


def get_schema_email_bigquery():
    return [
        bigquery.SchemaField('id_email', 'STRING', mode='required'),

# #print(start_object)

# # current date and time
# now = datetime.now()
# timestamp = int(datetime.timestamp(now))
# with open("time.txt", "w") as f:
#     f.write(str(timestamp))
# c = datetime.fromtimestamp(timestamp)
# current_object = c.strftime('%Y-%m-%d %H:%M:%S')
# #print(c)
# print(current_object)

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'api_key.json'
bq_client = Client()

query = "select full_content,url, publishedAt from `jadgs-262219.newsapi.news_articles`"
# query = "select url, full_content from `jadgs-262219.newsapi.news_articles` where publishedAt >= '" + start_object + "' and publishedAt < '" + current_object + "'"
query_job = bq_client.query(query)
results = query_job.result()

line_counter = 0
embedding_counter = 0

for row in results:
    if row[0] is not None:
        text = row[0]
        url = row[1]
        publishedAt = datetime.timestamp(row[2])
        doc = nlp(text)
        clusters = doc._.coref_clusters

from gensim.sklearn_api import hdp
from gensim.test.utils import datapath
import os
from collections import OrderedDict
from google.cloud.bigquery.client import Client
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.wrappers import LdaMallet
import json

######################## Fetch Data #########################
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'School of CIS Publication-c642ecab1ca7.json'
bq_client = Client()

query = (
    "select eid, title, abstract, area from cis_publications.pubs_metadata_by_scopus"
)
pubs_table = bq_client.query(
    query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
)  # API request - starts the query

################################# Data Input ######################################
# with open("20191021-pubs.json", "r", encoding='utf8') as file:
#     pubs_table = json.load(file)

# Get needed data columns from table into dict:
pubs_data = OrderedDict()

from google.cloud.bigquery.client import Client
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'projectig.json'
bq_client = Client()

query = """SELECT DISTINCT * FROM `projectig-247222.projectig.historical`"""
df = bq_client.query(query).to_dataframe()

        features_speechiness = features[0]['speechiness']
        features_tempo = features[0]['tempo']
        features_time_signature = features[0]['time_signature']
        features_valence = features[0]['valence']

        # Row to insert into the table
        rows_to_insert = [(artista, artist_followers, str(artist_genres), artist_id, artist_popularity,
                           album_type, album_genres, album_id, album_label, album_name, album_popularity,
                           album_release_date, album_realease_date_precision, track_name, str(track_avalible_markets),
                           str(track_explicit), track_id, track_popularity, track_number, features_acousticness,
                           features_analysis_url, features_danceability, features_duration_ms, features_energy,
                           features_instrumentalness, features_key, features_liveness, features_loudness,
                           features_mode, features_speechiness, features_tempo, features_time_signature,
                           features_valence)]

        # Authentication
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../datos/credenciales/MusicProjectTest-042a5c317e41.json'

        # Connect to the BigQuery API
        big_query_client = Client()

        # Select the dataset
        dataset_ref = big_query_client.dataset('musicData')

        # Select the table
        table_ref = dataset_ref.table('datos_spotify_{}'.format(letra_fichero))
        # table_ref = dataset_ref.table('Test')  # For testing inserts
        table = big_query_client.get_table(table_ref)

        # Insert data into BigQuery
        errors = big_query_client.insert_rows(table, rows_to_insert)  # API request
        for error in errors:
            logging.error('Big Query: %s', error)

    except Exception as e:
        logging.error(e)
        logging.error('-- Artista -> %s, album -> %s y cancion -> %s' % (artista, album_id, track_id))

except Exception as e:
    logging.error(e)

def tdr_bigquery_client():
    return Client()

def twitter(keyword):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = (r"C:\Users\MAIH004\My Project-4f1de5feb39a.json")
    bq_client = Client()
    # search_result = SearchResult.query.all()
    printResearch(keyword)
    return json.dumps('abc')

def run():
    """Executes the analysis and posts to slack"""
    client = Client()
    SETTINGS.load_from_environment()
    slack_notify(client)

def query_bigquery(query_file_location: str, bq_client: Client) -> List[Row]:
    query = read_sql_query(query_file_location)
    query_job = bq_client.query(query)
    return retrieve_query_results(query_job)

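# Hypothetical usage sketch (the .sql path is a placeholder, not from the
# original source; read_sql_query and retrieve_query_results are assumed to be
# the project's own helpers).
from google.cloud.bigquery.client import Client

rows = query_bigquery("queries/daily_report.sql", Client())
for row in rows:
    print(dict(row))
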
#
#
# To update data in the already existing BQ table use the code below
#
#
import os
from datetime import timedelta
from datetime import date

from google.cloud import bigquery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/.../Credentials.json"

from google.cloud.bigquery.client import Client

client = Client()

sql = """
    DELETE FROM `dataset.table_id`
    WHERE date >= '""" + str(start_date) + """'
"""

# Start the query, passing in the extra configuration.
query_job = client.query(sql)  # Make an API request.
query_job.result()  # Wait for the job to complete.

table_id = "project.dataset.table_id"

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("name", "STRING"),