def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, verbose=True, private_key=None, dialect='legacy',
             **kwargs):
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id, index_col=index_col,
        col_order=col_order, reauth=reauth, verbose=verbose,
        private_key=private_key, dialect=dialect, **kwargs)
def loadImages(self) -> pd.DataFrame:
    """Query all the images from the BigQuery table.

    Returns:
        Returns a DataFrame with all the images
    """
    df = pandas_gbq.read_gbq(
        'SELECT * FROM `sidhouses.osm_data.maps_images`',
        project_id='sidhouses')
    return df
def read_sql(query, billing_project_id=None, from_file=False, reauth=False):
    """Load data from BigQuery using a query. Just a wrapper around
    pandas_gbq.read_gbq.

    Args:
        query (str): Valid standard SQL query against basedosdados.
        billing_project_id (str): Optional. Project that will be billed.
            Find your Project ID here:
            https://console.cloud.google.com/projectselector2/home/dashboard
        reauth (boolean): Optional. Re-authorize Google Cloud Project in
            case you need to change user or reset configurations.

    Returns:
        pd.DataFrame: Query result
    """
    try:
        return pandas_gbq.read_gbq(
            query,
            credentials=credentials(from_file=from_file, reauth=reauth),
            project_id=billing_project_id,
        )
    except (OSError, ValueError):
        raise BaseDosDadosException(
            "\nWe are not sure which Google Cloud project should be billed.\n"
            "First, you should make sure that you have a Google Cloud project.\n"
            "If you don't have one, set one up following these steps: \n"
            "\t1. Go to this link https://console.cloud.google.com/projectselector2/home/dashboard\n"
            "\t2. Agree to the Terms of Service if asked\n"
            "\t3. Click on Create Project\n"
            "\t4. Give your project a cool name\n"
            "\t5. Hit create\n"
            "Copy the Project ID (notice that it is not the Project Name).\n"
            "Now, you have two options:\n"
            "1. Add an argument to your function pointing to the billing project id.\n"
            "   Like `bd.read_table('br_ibge_pib', 'municipios', billing_project_id=<YOUR_PROJECT_ID>)`\n"
            "2. You can set a project_id in the environment by running the following "
            "command in your terminal: `gcloud config set project <YOUR_PROJECT_ID>`. "
            "Bear in mind that you need `gcloud` installed.")
    except GenericGBQException as e:
        if "Reason: 403" in str(e):
            raise BaseDosDadosException(
                "\nYou still don't have a Google Cloud Project.\n"
                "Set one up following these steps: \n"
                "1. Go to this link https://console.cloud.google.com/projectselector2/home/dashboard\n"
                "2. Agree to the Terms of Service if asked\n"
                "3. Click on Create Project\n"
                "4. Give your project a cool name\n"
                "5. Hit create\n"
                "6. Rerun this command with the flag `reauth=True`. \n"
                "   Like `read_table('br_ibge_pib', 'municipios', reauth=True)`"
            )
        raise
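# A hedged usage sketch of read_sql above; the table path and billing project
# id are illustrative placeholders, not values confirmed by this snippet.
df = read_sql(
    "SELECT * FROM `basedosdados.br_ibge_pib.municipio` LIMIT 100",
    billing_project_id="<YOUR_PROJECT_ID>",
)
print(df.head())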
def search_in_user_vk_library(username, word='', mode='post',
                              project_id=project_id, credentials=credentials):
    try:
        if mode == 'post_from_group':
            group_name, word = word.split(',')
            word = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", word)
            words = word.lower().split()
            words = [w for w in words if w not in stops]
            words = [stemmer.stem(w) for w in words]
            if not words:  # nothing left after filtering and stemming
                return "invalid input"
            Query = (f'SELECT * FROM dataset.vk_storage_{username} '
                     f'WHERE group_name=\'{group_name}\' and (post LIKE \'')
            for word in words:
                Query += '%{}'.format(word)
            Query += '%\' or post LIKE \' '
            flag = True
            for word in words:
                if flag:
                    Query += '%{}'.format(word.capitalize())
                    flag = False
                else:
                    Query += '%{}'.format(word)
            Query += '%\')'
        word = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", word)
        words = word.lower().split()
        words = [w for w in words if w not in stops]
        words = [stemmer.stem(w) for w in words]
        if not words:
            return "invalid input"
        if mode == 'post':
            Query = f'SELECT * FROM dataset.vk_storage_{username} WHERE post LIKE \''
            for word in words:
                Query += '%{}'.format(word)
            Query += '%\' or post LIKE \' '
            flag = True
            for word in words:
                if flag:
                    Query += '%{}'.format(word.capitalize())
                    flag = False
                else:
                    Query += '%{}'.format(word)
            Query += '%\''
        print(Query)
        df = gbq.read_gbq(Query, project_id, credentials=credentials)
        result = df.values.tolist()
        return result
    except Exception:
        return []
def rawQuery(self, sql):
    """Send the sql to BQ and return the results.

    Args:
        sql (str): the sql string you care about

    Returns:
        DataFrame: a pandas.DataFrame of the results
    """
    df = pandas_gbq.read_gbq(sql)
    return df
def login():
    if request.method == 'GET':
        return render_template('login.html')
    else:
        email = request.form.get('email')
        password = request.form.get('password')
        print(email, password)
        # NOTE: interpolating user input into SQL is vulnerable to injection;
        # see the parameterized sketch below.
        SQL = """
            SELECT password
            FROM `movie.users`
            WHERE email='%s'
        """ % (email)
        try:
            df = pandas_gbq.read_gbq(SQL)
            print('success')
            print(df)
            if len(df) > 0:
                if df.iloc[0].password == password:
                    cur_user = UserMixin()
                    cur_user.id = email
                    login_user(cur_user)
                    session['user_email'] = email
                    SQL = """
                        SELECT uid
                        FROM `movie.users`
                        WHERE email='%s'
                    """ % (email)
                    df = pandas_gbq.read_gbq(SQL)
                    session['user_id'] = int(df.iloc[0].uid)
                    session.permanent = True
                    return redirect(url_for('index'))
                else:
                    error = 'email or password is wrong, try again'
                    return render_template('login.html', error=error)
            else:
                error = 'user does not exist'
                return render_template('login.html', error=error)
        except Exception:
            error = 'something went wrong, try again'
            return render_template('login.html', error=error)
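# A minimal sketch of a safer variant of the lookup above, assuming standard
# SQL and passing a BigQuery named query parameter through read_gbq's
# `configuration` argument (parameterMode/queryParameters are fields of the
# BigQuery REST API's jobs configuration.query); not the author's code.
def get_password_row(email):
    sql = "SELECT password FROM `movie.users` WHERE email = @email"
    config = {
        "query": {
            "parameterMode": "NAMED",
            "queryParameters": [{
                "name": "email",
                "parameterType": {"type": "STRING"},
                "parameterValue": {"value": email},
            }],
        }
    }
    # User input travels as a bound parameter, never as SQL text.
    return pandas_gbq.read_gbq(sql, configuration=config, dialect="standard")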
def query(self, query: str, chunksize: int, exact=False) -> Generator[DataFrame, None, None]:
    """Query DAO repo and return a generator of DataFrames with query results.

    Keyword Arguments:
        query {str} -- Query string
        chunksize {int} -- Number of rows of dataframe per chunk
        exact {bool} -- If false, query orders results and returns chunks

    Returns:
        Generator[DataFrame] -- Generator to iterate over DataFrame results.
    """
    if query is None:
        raise DaoError
    try:
        if exact:
            self.LOGGER.info(f"Querying {self.tablename}: {query}")
            response_df = pandas_gbq.read_gbq(query, progress_bar_type=None)
            yield response_df
        if not exact:
            offset = 0
            # Remove any trailing semicolon so ordering can be appended.
            query = query.strip(";")
            while True:
                add_query = (
                    f" ORDER BY {self._pkey} LIMIT {chunksize} OFFSET {offset};"
                )
                gbq_query = query + add_query
                self.LOGGER.info(f"Querying {self.tablename}: {gbq_query}")
                response_df = pandas_gbq.read_gbq(gbq_query)
                if response_df.empty:
                    return
                offset += chunksize
                yield response_df
    except Exception as gbq_exp:
        self.LOGGER.exception(str(gbq_exp))
        raise DaoError(gbq_exp)
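# A hedged usage sketch of the chunked generator above; `dao`, the table path,
# and process() are hypothetical stand-ins.
for chunk in dao.query("SELECT * FROM `my_project.my_dataset.events`",
                       chunksize=10_000):
    process(chunk)
# Note: LIMIT/OFFSET pagination re-runs the query for each chunk, so every
# page rescans the source; the exact=True path avoids that for one-shot reads.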
def loadVillages(self) -> pd.DataFrame:
    """Query all the villages from the BigQuery table, generating the
    polygons and points.

    Returns:
        Returns a DataFrame with all the villages
    """
    df = pandas_gbq.read_gbq('SELECT * FROM sidhouses.osm_data.villages')
    self.create_polygons(df)
    self.create_points(df)
    return df
def author_relationship(name):
    pandas_gbq.context.credentials = credentials
    pandas_gbq.context.project = "bigdata-259800"
    nodes_info = {}
    nodes, edges = [], []
    p_name = name.split(' ')
    name = ''.join(['%' + i for i in p_name]) + '%'
    SQL = (f"SELECT * FROM `bigdata-259800.authorinfo.author_edges` "
           f"where lower(source) like '%{name.lower()}%' limit 200")
    df = pandas_gbq.read_gbq(SQL)
    au_info = df.source[0].split('-')
    source_node = au_info[0]
    try:
        contact_info = ''.join(au_info[1:])
    except Exception:
        contact_info = ''
    nodes_info[source_node] = {'id': 0, 'num': 0, 'info': contact_info}
    error_symbol = ['none', '@', 'and', 'or']
    for i in range(len(df.target)):
        current_node = df.target[i]
        split_info = current_node.split('-')
        node_name = split_info[0]
        # Skip malformed author names.
        if any(sym in node_name.lower() for sym in error_symbol):
            continue
        if len(split_info) > 1:
            c_info = ' '.join(split_info[1:])
        else:
            c_info = ''
        try:
            nodes_info[node_name]['num'] += 1
        except KeyError:
            nodes_info[node_name] = {'id': i + 1, 'num': 1, 'info': c_info}
    for key in nodes_info:
        if key == source_node:
            title = ('Contact info: ' + nodes_info[key]['info'] + '\n;' +
                     '# Friends: ' + str(len(nodes_info.keys()) - 1))
        else:
            title = 'Contact info: ' + nodes_info[key]['info']
        nodes.append({'id': nodes_info[key]['id'], 'label': key, 'title': title})
        if key != source_node:
            edges.append({'from': 0, 'to': nodes_info[key]['id'],
                          'width': nodes_info[key]['num'],
                          'label': str(nodes_info[key]['num']),
                          'font': {'size': 0}})
    return nodes, edges
def has_tweet_id(tweet_id):
    q = """
        SELECT og_tweet_id
        FROM twitter.tweets
        WHERE og_tweet_id = {}
    """.format(tweet_id)
    df = pandas_gbq.read_gbq(q, project_id=project_id, credentials=credentials)
    return df.shape[0] >= 1
def getFreshData(ProjectId):
    bigquery_sql = " ".join([
        "SELECT id, DATE(CAST(created_at AS DATETIME)) AS created, DATE(CAST(updated_at AS DATETIME)) AS updated, status, assignee_id",
        "FROM `xsolla_summer_school.customer_support`",
        "WHERE status IN ('closed','solved')",
        "ORDER BY updated_at"
    ])
    dataframe = pandas_gbq.read_gbq(bigquery_sql, project_id=ProjectId,
                                    dialect="standard")
    return dataframe
def read_gbq(query, project_id='robusta-lab', **kwargs):
    """Read the result of a query from Google BigQuery into a DataFrame."""
    return pandas_gbq.read_gbq(
        query, project_id, credentials=_get_credentials_gbq(), **kwargs)
def bq_python(ds, **ags):
    # Pull the credentials from the previous task.
    credentials = ags['task_instance'].xcom_pull(task_ids='bq_connection')
    # TODO: if the table does not exist, select all; otherwise select only
    # the newest rows (check against BQ).
    query = """select distinct order_id, user_id
               FROM `acube_2019.acube_fintech_final_project_2019`
               limit 10"""
    constellation = pandas_gbq.read_gbq(query=query,
                                        project_id='minerva-da-coe',
                                        credentials=credentials)
    print(constellation.head())
    return constellation
def connection(request):
    pandas_gbq.context.credentials = credentials
    pandas_gbq.context.project = "Your-Project"
    SQL1 = ''
    df1 = pandas_gbq.read_gbq(SQL1)
    SQL2 = ''
    df2 = pandas_gbq.read_gbq(SQL2)
    data = {}
    '''
    TODO: Finish the SQL to query the data; it should be limited to 8 rows.
    Then process them into the format below:

    Format of data:
    {'n': [xxx, xxx, xxx, xxx],
     'e': [{'source': xxx, 'target': xxx},
           {'source': xxx, 'target': xxx},
           ...]}
    '''
    return render(request, 'connection.html', data)
def create_bq(self):
    for dset in self.dsets:
        df = pandas_gbq.read_gbq(
            """SELECT * FROM jhu_covid_dset.{}""".format(dset))
        df.drop(['province_state', 'lat', 'long'], axis=1, inplace=True)
        df = df.groupby(['country_region']).sum().T
        df = self.clean_data(df)
        # df.drop(df.tail(1).index, inplace=True)
        pandas_gbq.to_gbq(df, 'torran_covid_dset.{}'.format(dset),
                          if_exists='replace')
    return
def loadOSMBuildings(self) -> pd.DataFrame:
    """Query all the buildings from the BigQuery table, generating the
    polygons and points.

    Returns:
        Returns a DataFrame with all the buildings
    """
    df = pandas_gbq.read_gbq('SELECT * FROM sidhouses.osm_data.buildings',
                             project_id='sidhouses')
    # self.create_polygons(df)
    self.create_points(df)
    return df
def gbq_query(self, url, str_args):
    """Queries a GBQ table and returns the output.

    Args:
        url: A string that contains a URL.
        str_args: A tuple of arguments to pass into an SQL query.

    Returns:
        A pandas data frame.
    """
    start = datetime.now()
    print('Started processing query: {}'.format(start))
    query = requests.get(url, allow_redirects=True).text.format(*str_args)
    try:
        results = pandas_gbq.read_gbq(query, dialect='standard',
                                      project_id=self.project,
                                      credentials=self.auth2)
    except pandas_gbq.exceptions.AccessDenied:
        self.get_authorization()
        results = pandas_gbq.read_gbq(query, dialect='standard',
                                      project_id=self.project,
                                      credentials=self.auth2)
    finish = datetime.now()
    print("Finished processing query: {}".format(finish))
    duration = finish - start
    time_diff = round(duration.total_seconds(), 2)
    print('Query returned: {0} results in {1} seconds \n'.format(
        len(results), time_diff))
    return results
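# A hedged usage sketch of gbq_query above: `client`, the URL, and the args
# are hypothetical; the URL is assumed to serve a SQL template containing
# str.format placeholders, e.g.
#   SELECT * FROM `{0}.{1}` WHERE date >= '{2}'
results = client.gbq_query(
    "https://example.com/queries/daily_report.sql",
    ("my_dataset", "my_table", "2021-01-01"),
)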
def update_tables(project_id: str,
                  new_source_tableid: str = 'raw',
                  destination_tableid: str = 'cities_refined',
                  operation_func=operation_refine_city_data_appendbq):
    raw = pandas_gbq.read_gbq(f"""
        SELECT *
        FROM `{project_id}.{config['dataset_id']}.{new_source_tableid}`
        WHERE CAST(datetime as DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 60 DAY)
    """)
    hist = pandas_gbq.read_gbq(f"""
        SELECT *
        FROM `{project_id}.{config['dataset_id']}.{destination_tableid}`
        WHERE CAST(datetime as DATE) > DATE_SUB(CURRENT_DATE(), INTERVAL 60 DAY)
    """)
    newly_arrived = filter_newly_arrived(raw, hist)
    if newly_arrived is not None:
        operation_func(project_id=project_id,
                       destination_tableid=destination_tableid,
                       newly_arrived=newly_arrived)
    else:
        print(
            f"0 rows added to table: {config['dataset_id']}.{destination_tableid}"
        )
def update_geodemo_chart(activity_selected, state_name, county_name):
    sql = """
        SELECT county_fips_code, prediction_date, new_confirmed, new_confirmed_ground_truth
        FROM `bigquery-public-data.covid19_public_forecasts.county_14d`
        WHERE state_name="{state_name}" AND county_name="{county_name}"
    """.format(state_name=state_name, county_name=county_name)
    forecast_df = pandas_gbq.read_gbq(sql, project_id=project_id)
    county_fips_code = forecast_df['county_fips_code'][0]
    county_pop = int(df_population_2019[
        df_population_2019['fips'] == county_fips_code]['population'])
    try:
        return make_original_property_graph(activity_selected, county_pop,
                                            forecast_df)
    except Exception:
        return None
def load_t0_from_bq(area, project_id):
    start_time = time.time()
    summary_sql = """
        SELECT distinct section
        FROM `ETL.root_sku`
        WHERE area = "%s"
    """ % (area)
    for i in tqdm(range(1), desc='Loading table...'):
        section_table = pandas_gbq.read_gbq(summary_sql, project_id=project_id)
    total_time = round((time.time() - start_time) / 60, 1)
    logger.info("Completed loading of distinct sections table from Bigquery in {a} mins...".format(a=total_time))
    return section_table
def load_t1_from_bq(project_id):
    start_time = time.time()
    summary_sql = """
        SELECT *
        FROM `prediction_results.post_prediction_train_input`
    """
    for i in tqdm(range(1), desc='Loading table...'):
        hist_promo_table = pandas_gbq.read_gbq(summary_sql, project_id=project_id)
    total_time = round((time.time() - start_time) / 60, 1)
    logger.info("Completed loading of historical post promotion table from Bigquery in {a} mins...".format(a=total_time))
    return hist_promo_table
def query_data(end_date, user_type, output_path):
    start_date = end_date - timedelta(days=27)
    start_date = start_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    query = f'''
        WITH OLD_USER AS (
            (SELECT user_id, TYPE
             FROM `ntufbdata.user_type.user_entering_type`
             WHERE TYPE = '{user_type}')
            UNION DISTINCT
            (SELECT user_id, TYPE
             FROM `ntufbdata.user_type.user_entering_type`
             WHERE TYPE = 'WHOLE')
        ),
        REACTION AS (
            (SELECT user_id, SPLIT(post_id, '_')[ORDINAL(1)] AS page_id, post_id
             FROM `ntufbdata.USdata.1000_page_us_user_like_post_201501_to_201611_all`
             WHERE TIMESTAMP(post_created_date_CT) >= TIMESTAMP('{start_date}')
               AND TIMESTAMP(post_created_date_CT) <= TIMESTAMP('{end_date}'))
            UNION DISTINCT
            (SELECT user_id, SPLIT(post_id, '_')[ORDINAL(1)] AS page_id, post_id
             FROM `ntufbdata.USdata.politician_us_user_post_like_all`
             WHERE TIMESTAMP(post_created_date_CT) >= TIMESTAMP('{start_date}')
               AND TIMESTAMP(post_created_date_CT) <= TIMESTAMP('{end_date}'))
        )
        SELECT user_id, TYPE,
               STRING_AGG(page_id, ',') AS like_pages,
               STRING_AGG(CAST(like_time AS STRING), ',') AS like_times
        FROM (
            SELECT OLD_USER.user_id, OLD_USER.TYPE, REACTION.page_id,
                   COUNT(*) AS like_time
            FROM OLD_USER
            INNER JOIN REACTION ON OLD_USER.user_id = REACTION.user_id
            GROUP BY OLD_USER.user_id, OLD_USER.TYPE, REACTION.page_id)
        GROUP BY user_id, TYPE
    '''
    user_like_pages = gbq.read_gbq(query, project_id='ntufbdata')
    user_like_pages.to_csv(f'{output_path}{end_date}.csv', index=False)
def query_to_df(self, sql_query):
    """Query sql_query and return results in a pandas dataframe.

    Parameters
    ----------
    sql_query : string

    Returns
    -------
    df : pandas.DataFrame
    """
    df = pandas_gbq.read_gbq(sql_query, project_id=self.project_id,
                             credentials=self.credentials)
    return df
def update_heatmap(state_name, county_name):
    sql = """
        SELECT county_fips_code, prediction_date, new_confirmed, new_confirmed_ground_truth
        FROM `bigquery-public-data.covid19_public_forecasts.county_14d`
        WHERE state_name="{state_name}" AND county_name="{county_name}"
    """.format(state_name=state_name, county_name=county_name)
    forecast_df = pandas_gbq.read_gbq(sql, project_id=project_id)
    county_fips_code = forecast_df['county_fips_code'][0]
    county_pop = int(df_population_2019[
        df_population_2019['fips'] == county_fips_code]['population'])
    # Return to the original heatmap (no colored annotation) by resetting.
    try:
        return generate_forecast_heatmap(county_pop, forecast_df)
    except Exception:
        return None
def load_data(self):
    # sql = '''
    # SELECT l.increment_id order_code
    #      , max(ward_sellerboom_score) ward_sellerboom_score
    #      , max(ward_reseller_score) ward_reseller_score
    #      , max(ward_other_score) ward_other_score
    #      , s.* except(order_code, ward_sellerboom_score, ward_reseller_score, ward_other_score)
    # FROM `tiki-dwh.sherlock.fraud_label_2020405` l
    # LEFT JOIN `tiki-dwh.sherlock.feature_summary_*` s
    #   ON cast(l.increment_id as string) = s.order_code
    # GROUP BY 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
    #     22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
    #     40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    #     58, 59
    # '''
    sql = 'select * from `tiki-dwh.consumer_product.fraud_raw_data`'
    raw = pandas_gbq.read_gbq(sql, project_id='tiki-dwh',
                              credentials=credentials)
    return raw
def query(self, sql_query, show_progress=False):
    """Run BigQuery queries on this dataset.

    Tables can be referenced in FROM and JOIN SQL clauses using their
    .full_name attributes injected into a query template.
    """
    if show_progress:
        progress_bar_type = 'tqdm'
    else:
        progress_bar_type = None
    df = pandas_gbq.read_gbq(
        sql_query,
        project_id=self.project_id,
        progress_bar_type=progress_bar_type,
    )
    return df
def get_dataframe_from_table(self, project, _dataset, _table):
    # TODO: Take the columns as a parameter for the lookup.
    # columns = "PARTNER_ID, PRODUCT_ID, MANUFACTURER_ID, CATEGORY_1, CATEGORY_2, CATEGORY_3, CATEGORY_4, NAME, INTRODUCED_DATE, RETIRED_DATE, UNIT, BRAND, PACKAGE_SIZE, PACKAGE_UNIT, PRIVATE_LABEL_FLAG, GTIN"
    columns = "*"
    dtset = _dataset
    tble = _table
    print("table ==> " + _table)
    sQuery = "SELECT " + columns + " FROM " + dtset + ".`" + tble + "`"
    df_return = pandas_gbq.read_gbq(sQuery, project,
                                    private_key=self.key,
                                    dialect='standard')
    return df_return
def get_df(self, data_source: GoogleBigQueryDataSource) -> pd.DataFrame:
    """
    Uses the pandas-gbq read_gbq method to extract data from BigQuery into
    a dataframe. See:
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.gbq.read_gbq.html

    Note: The parameter reauth is set to True to force Google BigQuery to
    reauthenticate the user for each query. This is necessary when
    extracting multiple datasets, to avoid the error:
    [Errno 54] Connection reset by peer
    """
    credentials = (
        self.credentials.get_google_oauth2_credentials().with_scopes(
            self.scopes))
    return pandas_gbq.read_gbq(query=data_source.query,
                               project_id=self.credentials.project_id,
                               credentials=credentials,
                               dialect=self.dialect)
def main(project_id):
    # [START bigquery_pandas_gbq_read_gbq_simple]
    import pandas_gbq

    # TODO: Set project_id to your Google Cloud Platform project ID.
    # project_id = "my-project"

    sql = """
        SELECT country_name, alpha_2_code
        FROM `bigquery-public-data.utility_us.country_code_iso`
        WHERE alpha_2_code LIKE 'A%'
    """
    df = pandas_gbq.read_gbq(sql, project_id=project_id)
    # [END bigquery_pandas_gbq_read_gbq_simple]
    print(df)
    return df
def load_daily_trans_from_bq(cat, project_id):
    start_time = time.time()
    sql_str = """
    SELECT sku_root_id, store_id, std_price_per_unit,
           AVG(avg_sales_qty_per_week) as avg_sales_qty,
           AVG(actual_price) as actual_price,
           COUNT(week_start_date) as duration_weeks,
           STDDEV(avg_sales_qty_per_week) as std_dev_sales_qty
    FROM (with temp as (
        SELECT sku_root_id, store_id,
               DATE_TRUNC(date, WEEK(MONDAY)) as week_start_date,
               std_price_per_unit,
               SUM(total_sale_qty) as avg_sales_qty_per_week,
               SAFE_DIVIDE(SUM(total_sale_amt), SUM(total_sale_qty)) as actual_price
        FROM `gum-eroski-dev.ETL.aggregate_daily_transaction_to_sku`
        WHERE area in ("ALIMENTACION", "FRESCOS")
        AND category = {c}
        AND promo_flag = false
        #AND store_id in ('149','155','157','159','164','165','182','184','185','190','192','201','207','208','209','212','213','5','6','16','22','25','26','28','29','30','31','36','41','46','47','51','52','68','74','86','87','88','96','98','99','101','103','106','108','119','120','125','138','143','144','263','264','266','280','281','282','283','290','300','302','308','316','320','323','326','330','217','219','223','224','230','231','233','234','235','236','238','240','243','245','248','249','259','393','397','400','401','403','410','418','419','420','422','424','427','429','430','475','331','334','335','346','352','357','358','359','371','378','379','380','381','383','385','387','390','476','479','480','484','488','489','495','496','498','501','502','505','510','512','544','546','547','548','549','550','551','552','553','555','556','558','562','587','599','602','607','727','730','733','734','735','736','748','890','996','1000','1362','1373','1374','1392','1393','1499','1501','1504','2003','2005','3001','608','663','664','665','666','668','669','671','673','674','677','678','679','690','691','718','719','720','721','723','726','3057','3058','3059','3090','3091','3092','3095','3097','3098','3100','3102','3104','3105','3106','3107','3108','3109','3110','3111','3112','3113','3114','3116','3004','3008','3011','3013','3017','3020','3023','3024','3025','3026','3027','3028','3029','3034','3036','3042','3045','3049','3052','3053','3054','3055','3056','3177','3183','3188','3189','3194','3196','3203','3204','3206','3207','3208','3209','3213','3214','3219','3224','3226','3228','3229','3230','3232','3235','3238','3117','3118','3119','3120','3121','3122','3123','3125','3126','3127','3128','3129','3130','3131','3138','3139','3154','3155','3156','3162','3163','3165','3175','3239','3240','3242','3244','3245','3247','3249','3250','3252','3254','3256','3257','3259','3261','3262','3263','3264','3266','3268','3294','3295','3297','3298','3299','3368','3369','3370','3371','3372','3373','3382','3384','3387','3643','3978','3979','3981','3982','3984','3986','3987','3988','3989','3991','3992','3994','3995','3644','3646','3647','3648','3885','3886','3888','3902','3906','3907','3908','3911','3912','3913','3914','3917','3919','3922','3971','3972','3975','3976','3977','4264','4273','4277','4296','4297','4299','4357','4360','4361','4041','4047','4090','4091','4102','4103','4106','4111','4128','4134','4203','4247','4261','4484','4600','4369','4371','4373','4374','4382','4384','4388','4390','4469','4705','4749','4750','4751','4752','4753','4754','4755','4756','4757','4758','4759','4761','4763','4764','4767','4768','4785','4786','4935','4937','5382','6413','6414','6438','6483','7514','7564','7565','7566','7567','7569','7573','8122','8133','8143','8144','8149','8206','8212','8216','8219','8221','9050','9059','9064','6767','6768','9891','271','288','262','3985','3990','4395','6136','6282','6283','6284','7575','8121','8127','8135','8211','9026','9030','9061','9706','9803','9877','9879','9887','9889','9959','210','433','5007','5091','5106','5111','5301','5318','5725','5744','7444','5016','5086','5371','175','187','202','250','399','445','5009','5021','5040','5052','5083','5908','7423')
        AND total_sale_qty <> 0
        GROUP BY sku_root_id, week_start_date, store_id, std_price_per_unit
    )
    SELECT * from temp
    WHERE EXTRACT(MONTH from week_start_date) <> 12
    AND SAFE_DIVIDE(ABS(actual_price-std_price_per_unit),std_price_per_unit)<0.1)
    GROUP BY sku_root_id, store_id, std_price_per_unit
    """.format(c=cat)
    for i in tqdm(range(1), desc='Loading table...'):
        category_table = pandas_gbq.read_gbq(sql_str, project_id=project_id)
    total_time = round((time.time() - start_time) / 60, 1)
    logger.info(
        "Completed loading of category table from Bigquery in {a} mins...".format(
            a=total_time))
    return category_table
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, verbose=True, private_key=None, dialect='legacy',
             **kwargs):
    r"""Load data from Google BigQuery.

    The main method a user calls to execute a Query in Google BigQuery
    and read results into a pandas DataFrame.

    Google BigQuery API Client Library v2 for Python is used.
    Documentation is available `here
    <https://developers.google.com/api-client-library/python/apis/bigquery/v2>`__

    Authentication to the Google BigQuery service is via OAuth 2.0.

    - If "private_key" is not provided:

      By default "application default credentials" are used.

      If default application credentials are not found or are restrictive,
      user account credentials are used. In this case, you will be asked to
      grant permissions for product name 'pandas GBQ'.

    - If "private_key" is provided:

      Service account credentials will be used to authenticate.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values
    project_id : str
        Google BigQuery Account project ID.
    index_col : str (optional)
        Name of result column to use for index in results DataFrame
    col_order : list(str) (optional)
        List of BigQuery column names in the desired order for results
        DataFrame
    reauth : boolean (default False)
        Force Google BigQuery to reauthenticate the user. This is useful
        if multiple accounts are used.
    verbose : boolean (default True)
        Verbose output
    private_key : str (optional)
        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. jupyter iPython notebook on remote host)
    dialect : {'legacy', 'standard'}, default 'legacy'
        'legacy' : Use BigQuery's legacy SQL dialect.
        'standard' : Use BigQuery's standard SQL (beta), which is
        compliant with the SQL 2011 standard. For more information
        see `BigQuery SQL Reference
        <https://cloud.google.com/bigquery/sql-reference/>`__
    **kwargs : Arbitrary keyword arguments
        configuration (dict): query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery SQL Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query
    """
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id, index_col=index_col,
        col_order=col_order, reauth=reauth, verbose=verbose,
        private_key=private_key, dialect=dialect, **kwargs)
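# A short sketch of the `configuration` kwarg documented in the docstring
# above; the project id is a placeholder, and useQueryCache is the job option
# the docstring itself shows.
df = read_gbq(
    "SELECT 1 AS x",
    project_id="my-project",
    dialect="standard",
    configuration={"query": {"useQueryCache": False}},
)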
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, verbose=None, private_key=None, dialect='legacy',
             **kwargs):
    """
    Load data from Google BigQuery.

    This function requires the `pandas-gbq package
    <https://pandas-gbq.readthedocs.io>`__.

    Authentication to the Google BigQuery service is via OAuth 2.0.

    - If "private_key" is not provided:

      By default "application default credentials" are used.

      If default application credentials are not found or are restrictive,
      user account credentials are used. In this case, you will be asked to
      grant permissions for product name 'pandas GBQ'.

    - If "private_key" is provided:

      Service account credentials will be used to authenticate.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values.
    project_id : str
        Google BigQuery Account project ID.
    index_col : str, optional
        Name of result column to use for index in results DataFrame.
    col_order : list(str), optional
        List of BigQuery column names in the desired order for results
        DataFrame.
    reauth : boolean, default False
        Force Google BigQuery to re-authenticate the user. This is useful
        if multiple accounts are used.
    private_key : str, optional
        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. Jupyter/IPython notebook on remote host).
    dialect : str, default 'legacy'
        SQL syntax dialect to use. Value can be one of:

        ``'legacy'``
            Use BigQuery's legacy SQL dialect. For more information see
            `BigQuery Legacy SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
        ``'standard'``
            Use BigQuery's standard SQL, which is
            compliant with the SQL 2011 standard. For more information
            see `BigQuery Standard SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
    verbose : boolean, deprecated
        *Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module
        to adjust verbosity instead
        <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
    kwargs : dict
        Arbitrary keyword arguments.
        configuration (dict): query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery SQL Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query.

    See Also
    --------
    pandas_gbq.read_gbq : This function in the pandas-gbq library.
    pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
    """
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id, index_col=index_col,
        col_order=col_order, reauth=reauth, verbose=verbose,
        private_key=private_key, dialect=dialect, **kwargs)
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, private_key=None, auth_local_webserver=False,
             dialect='legacy', location=None, configuration=None,
             verbose=None):
    """
    Load data from Google BigQuery.

    This function requires the `pandas-gbq package
    <https://pandas-gbq.readthedocs.io>`__.

    See the `How to authenticate with Google BigQuery
    <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
    guide for authentication instructions.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values.
    project_id : str, optional
        Google BigQuery Account project ID. Optional when available from
        the environment.
    index_col : str, optional
        Name of result column to use for index in results DataFrame.
    col_order : list(str), optional
        List of BigQuery column names in the desired order for results
        DataFrame.
    reauth : boolean, default False
        Force Google BigQuery to re-authenticate the user. This is useful
        if multiple accounts are used.
    private_key : str, optional
        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. Jupyter/IPython notebook on remote host).
    auth_local_webserver : boolean, default False
        Use the `local webserver flow`_ instead of the `console flow`_
        when getting user credentials.

        .. _local webserver flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
        .. _console flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

        *New in version 0.2.0 of pandas-gbq*.
    dialect : str, default 'legacy'
        SQL syntax dialect to use. Value can be one of:

        ``'legacy'``
            Use BigQuery's legacy SQL dialect. For more information see
            `BigQuery Legacy SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
        ``'standard'``
            Use BigQuery's standard SQL, which is
            compliant with the SQL 2011 standard. For more information
            see `BigQuery Standard SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
    location : str, optional
        Location where the query job should run. See the `BigQuery locations
        documentation
        <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
        list of available locations. The location must match that of any
        datasets used in the query.

        *New in version 0.5.0 of pandas-gbq*.
    configuration : dict, optional
        Query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery REST API Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
    verbose : None, deprecated
        Deprecated in Pandas-GBQ 0.4.0. Use the `logging module to adjust
        verbosity instead
        <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query.

    See Also
    --------
    pandas_gbq.read_gbq : This function in the pandas-gbq library.
    pandas.DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
    """
    pandas_gbq = _try_import()
    return pandas_gbq.read_gbq(
        query, project_id=project_id, index_col=index_col,
        col_order=col_order, reauth=reauth, verbose=verbose,
        private_key=private_key, auth_local_webserver=auth_local_webserver,
        dialect=dialect, location=location, configuration=configuration)
def read_gbq(query, project_id=None, index_col=None, col_order=None,
             reauth=False, auth_local_webserver=False, dialect=None,
             location=None, configuration=None, credentials=None,
             use_bqstorage_api=None, private_key=None, verbose=None):
    """
    Load data from Google BigQuery.

    This function requires the `pandas-gbq package
    <https://pandas-gbq.readthedocs.io>`__.

    See the `How to authenticate with Google BigQuery
    <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
    guide for authentication instructions.

    Parameters
    ----------
    query : str
        SQL-Like Query to return data values.
    project_id : str, optional
        Google BigQuery Account project ID. Optional when available from
        the environment.
    index_col : str, optional
        Name of result column to use for index in results DataFrame.
    col_order : list(str), optional
        List of BigQuery column names in the desired order for results
        DataFrame.
    reauth : boolean, default False
        Force Google BigQuery to re-authenticate the user. This is useful
        if multiple accounts are used.
    auth_local_webserver : boolean, default False
        Use the `local webserver flow`_ instead of the `console flow`_
        when getting user credentials.

        .. _local webserver flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
        .. _console flow:
            http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

        *New in version 0.2.0 of pandas-gbq*.
    dialect : str, default 'legacy'
        Note: The default value is changing to 'standard' in a future
        version.

        SQL syntax dialect to use. Value can be one of:

        ``'legacy'``
            Use BigQuery's legacy SQL dialect. For more information see
            `BigQuery Legacy SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
        ``'standard'``
            Use BigQuery's standard SQL, which is
            compliant with the SQL 2011 standard. For more information
            see `BigQuery Standard SQL Reference
            <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.

        .. versionchanged:: 0.24.0
    location : str, optional
        Location where the query job should run. See the `BigQuery locations
        documentation
        <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
        list of available locations. The location must match that of any
        datasets used in the query.

        *New in version 0.5.0 of pandas-gbq*.
    configuration : dict, optional
        Query config parameters for job processing.
        For example:

            configuration = {'query': {'useQueryCache': False}}

        For more information see `BigQuery REST API Reference
        <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
    credentials : google.auth.credentials.Credentials, optional
        Credentials for accessing Google APIs. Use this parameter to
        override default credentials, such as to use Compute Engine
        :class:`google.auth.compute_engine.Credentials` or Service Account
        :class:`google.oauth2.service_account.Credentials` directly.

        *New in version 0.8.0 of pandas-gbq*.

        .. versionadded:: 0.24.0
    use_bqstorage_api : bool, default False
        Use the `BigQuery Storage API
        <https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
        download query results quickly, but at an increased cost. To use
        this API, first `enable it in the Cloud Console
        <https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
        You must also have the `bigquery.readsessions.create
        <https://cloud.google.com/bigquery/docs/access-control#roles>`__
        permission on the project you are billing queries to.

        This feature requires version 0.10.0 or later of the ``pandas-gbq``
        package. It also requires the ``google-cloud-bigquery-storage`` and
        ``fastavro`` packages.

        .. versionadded:: 0.25.0
    private_key : str, deprecated
        Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
        parameter and
        :func:`google.oauth2.service_account.Credentials.from_service_account_info`
        or
        :func:`google.oauth2.service_account.Credentials.from_service_account_file`
        instead.

        Service account private key in JSON format. Can be file path
        or string contents. This is useful for remote server
        authentication (eg. Jupyter/IPython notebook on remote host).
    verbose : None, deprecated
        Deprecated in pandas-gbq version 0.4.0. Use the `logging module to
        adjust verbosity instead
        <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.

    Returns
    -------
    df: DataFrame
        DataFrame representing results of query.

    See Also
    --------
    pandas_gbq.read_gbq : This function in the pandas-gbq library.
    DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
    """
    pandas_gbq = _try_import()

    kwargs = {}

    # START: new kwargs.  Don't populate unless explicitly set.
    if use_bqstorage_api is not None:
        kwargs["use_bqstorage_api"] = use_bqstorage_api
    # END: new kwargs

    # START: deprecated kwargs.  Don't populate unless explicitly set.
    if verbose is not None:
        kwargs["verbose"] = verbose

    if private_key is not None:
        kwargs["private_key"] = private_key
    # END: deprecated kwargs

    return pandas_gbq.read_gbq(
        query,
        project_id=project_id,
        index_col=index_col,
        col_order=col_order,
        reauth=reauth,
        auth_local_webserver=auth_local_webserver,
        dialect=dialect,
        location=location,
        configuration=configuration,
        credentials=credentials,
        **kwargs)
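# A hedged usage sketch of the signature above, authenticating with a service
# account; the key path and project id are placeholders, and the public table
# is used purely for illustration.
from google.oauth2 import service_account

creds = service_account.Credentials.from_service_account_file("key.json")
df = read_gbq(
    "SELECT name FROM `bigquery-public-data.usa_names.usa_1910_current` LIMIT 10",
    project_id="my-project",
    credentials=creds,
    dialect="standard",
)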