import os os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json" import plotly.graph_objects as go from plotly.offline import plot from bq_helper import BigQueryHelper client = BigQueryHelper('bigquery-public-data', 'covid19_italy') query = """ select * from bigquery-public-data.covid19_italy.data_by_region """ query_for_bar = """ select date, hospitalized_patients_symptoms from bigquery-public-data.covid19_italy.data_by_region where region_name="Toscana" """ query_for_pie = """ select region_name, SUM(hospitalized_patients_symptoms) AS sick from bigquery-public-data.covid19_italy.data_by_region group by region_name """ query_for_scatter = """ select date, AVG(total_hospitalized_patients) as hospitalized from bigquery-public-data.covid19_italy.data_by_region
# Pull 100 SFPD incident records from the BigQuery public `san_francisco`
# dataset and build a Plotly line trace of incident category vs. record key.
import pandas as pd
import numpy as pn
import plotly.graph_objs as go
import os
from plotly.offline import plot
from bq_helper import BigQueryHelper

# Service-account key file is expected next to this script.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

bq_assistant = BigQueryHelper('bigquery-public-data', 'san_francisco')

QUERY = """ SELECT unique_key, descript, dayofweek, category FROM `bigquery-public-data.san_francisco.sfpd_incidents` LIMIT 100 """
df = bq_assistant.query_to_pandas(QUERY)
print(df.head(4))

trace1 = go.Scatter(
    # BUG FIX: was x=df['unique_key'].index, which plotted the positional row
    # index (0..99) rather than the incident keys named on the x-axis title.
    x=df['unique_key'],
    y=df['category'],
    mode="lines",
    name="category",
)
layout = dict(
    title='',
    xaxis=dict(title='unique_key'),
    yaxis=dict(title='category'),
)
fig = dict(data=[trace1], layout=layout)
# sample query from: # https://cloud.google.com/bigquery/public-data/openaq#which_10_locations_have_had_the_worst_air_quality_this_month_as_measured_by_high_pm10 QUERY = """ SELECT location, city, country, value, timestamp FROM `bigquery-public-data.openaq.global_air_quality` WHERE pollutant = "pm10" AND timestamp > "2017-04-01" ORDER BY value DESC LIMIT 1000 """ # The quick way to execute this query is to use [the bq_helper library](https://github.com/SohierDane/BigQuery_Helper): # In[7]: bq_assistant = BigQueryHelper('bigquery-public-data', 'openaq') df = bq_assistant.query_to_pandas(QUERY) df.head(3) # But what is bq_helper actually doing under the hood? Let's replicate the same process through the core BigQuery API to find out. # In[8]: client = bigquery.Client() query_job = client.query(QUERY) rows = list(query_job.result(timeout=30)) for row in rows[:3]: print(row) # The outputs look reasonable, but what's this storage format?
# Export the full `github_repos.languages` table to a tab-separated file.
import pandas as pd
from bq_helper import BigQueryHelper

# Reminder of the credentials export the operator must run beforehand
# (the script only prints it; it does not set the variable itself).
print("export GOOGLE_APPLICATION_CREDENTIALS=\"/Users/davidwu/Downloads/cs221-project-1b5850653193.json\"")

bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")
#all_tables = bq_assistant.list_tables()
#for table in all_tables:
#    print(table)
#    print(str(bq_assistant.table_schema(table)))

QUERY = """ SELECT * FROM `bigquery-public-data.github_repos.languages` """
#print(bq_assistant.estimate_query_size(QUERY))
print("HELLO")
df = bq_assistant.query_to_pandas_safe(QUERY)
print("HELLO")
# BUG FIX: query_to_pandas_safe returns None when the query would scan more
# than its GB budget; the original then crashed with AttributeError on
# df.to_csv. Guard and report instead.
if df is not None:
    df.to_csv("all_project_languages.txt", sep='\t')
else:
    print("Query exceeded the safe scan limit; no output written.")
# Sample 1000 CO hourly-summary rows from the EPA public dataset and build
# Plotly traces of measurement counts per state, date, and detection limit.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"
import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality')
QUERY = """ SELECT `state_code`, `date_local`, `mdl`, `parameter_name` FROM `bigquery-public-data.epa_historical_air_quality.co_hourly_summary` LIMIT 1000 """
df = bq_assistant.query_to_pandas(QUERY)

# Number of measurements per grouping key (parameter_name is just a non-null
# column to count).
state_code_count = df.groupby(['state_code'])['parameter_name'].count()
date_local_count = df.groupby(['date_local'])['parameter_name'].count()
mdl_count = df.groupby(['mdl'])['parameter_name'].count()

trace1 = go.Scatter(x=state_code_count.index, y=state_code_count.values)
trace2 = go.Pie(labels=date_local_count.index, values=date_local_count.values)
trace3 = go.Bar(x=mdl_count.index, y=mdl_count.values)
# BUG FIX: y-axis label typo 'parmater_name' -> 'parameter_name'.
layout1 = go.Layout(xaxis=dict(title='state_code'), yaxis=dict(title='parameter_name'))
layout2 = go.Layout(title='data', xaxis=dict(title=''), yaxis=dict(title=''))
def GLquery(query="query1",
            country="US",
            start_year="1790",
            end_year="2019",
            min_count="100",
            kind_code="B2",
            cpc_code="A63F",
            assignee="President and Fellows of Harvard College",
            keyword="internet of things",
            budget=1000,
            output_to_csv=False,
            plot=False,
            ask_before_running=False):
    """Run one of ten canned Google Patents (patents-public-data) BigQuery
    queries, optionally exporting the result to CSV and/or plotting it.

    Parameters
    ----------
    query : str
        One of "query1".."query10":
          query1  - number of patent applications by country (no inputs)
          query2  - patents published between start_year and end_year, by country
          query3  - patents granted to `country` between start_year and end_year
          query4  - which patents `country` published in the period (uses kind_code)
          query5  - most common patenting technology areas by year (no inputs)
          query6  - most common technology areas in `country` for the period
          query7  - inventors with over `min_count` patents, by country
          query8  - patent landscaping of technology `cpc_code` in the period
          query9  - firms that assignee `assignee` cites in its patents
          query10 - applications mentioning `keyword` in `country`
    country : str        - two-letter country code, e.g. "US"
    start_year, end_year : str - "YYYY" strings substituted into the SQL
    min_count : str      - minimum-count threshold (queries 7 and 8)
    kind_code : str      - publication kind code, e.g. "B2" (query4)
    cpc_code : str       - CPC code regex fragment, e.g. "A63F" (query8)
    assignee : str       - assignee name, case sensitive (query9)
    keyword : str        - abstract keyword, case sensitive (query10)
    budget : number      - max GB the query may scan (cap for max_gb_scanned)
    output_to_csv : bool - write `<query>.csv` into `my_output_path`
    plot : bool          - save a bar chart PDF (queries 1, 2, 3, 10 only)
    ask_before_running : bool - prompt the user before executing

    Returns a pandas DataFrame (or the .to_csv result when exporting).
    Relies on module-level imports (bq_helper, re, sys, Path, plt, sns,
    figure) and the module-level `my_output_path` output directory.
    """
    # `my_output_path` is defined at module level and used for CSV/PDF output.
    global my_output_path
    # Two helper objects over the same dataset: `patents` runs the queries,
    # `bq_assistant` is used for size estimates.
    patents = bq_helper.BigQueryHelper(active_project="patents-public-data",
                                       dataset_name="patents")
    bq_assistant = BigQueryHelper("patents-public-data", "patents")
    print("Country selected: {}.".format(country))
    print("Period selected: from {} to {}.".format(start_year, end_year))

    def replace(string, substitutions):
        # Multi-placeholder substitution in one pass: longest keys first so
        # overlapping placeholders cannot clobber each other.
        substrings = sorted(substitutions, key=len, reverse=True)
        regex = re.compile('|'.join(map(re.escape, substrings)))
        return regex.sub(lambda match: substitutions[match.group(0)], string)

    def query_selector(query):
        # Map query name -> SQL text with XXXX/YYYY/ZZZZ/... placeholders
        # already substituted from the function arguments.
        switcher = {
            "query1": """ -- "Number of publications by country" SELECT COUNT(*) AS cnt, country_code FROM ( SELECT ANY_VALUE(country_code) AS country_code FROM `patents-public-data.patents.publications` GROUP BY application_number) GROUP BY country_code ORDER BY cnt DESC """,
            "query2": replace(
                """ -- "Number of patents published between years X and Y by country" SELECT country_code, COUNT(DISTINCT publication_number) AS publications FROM `patents-public-data.patents.publications` WHERE publication_date >= XXXX0000 AND publication_date < YYYY0000 AND application_kind = 'A' GROUP BY country_code ORDER BY publications DESC """,
                {"XXXX": start_year, "YYYY": end_year}),
            "query3": replace(
                """ -- "Number of patents granted to country Z between years X and Y" SELECT FLOOR(publication_date/10000) as datum, COUNT(DISTINCT publication_number) as publications FROM `patents-public-data.patents.publications` WHERE country_code = 'ZZZZ' AND publication_date >= XXXX0000 AND publication_date <= YYYY0000 AND application_kind = 'A' GROUP BY datum, application_kind ORDER BY application_kind, datum """,
                {"XXXX": start_year, "YYYY": end_year, "ZZZZ": country}),
            "query4": replace(
                """ -- "Which patents country Z published between years X and Y" SELECT publication_number FROM `patents-public-data.patents.publications` WHERE country_code = 'ZZZZ' AND publication_date >= XXXX0000 AND publication_date <= YYYY0000 AND application_kind = 'A' AND kind_code = 'BBBB' """,
                {"XXXX": start_year, "YYYY": end_year, "ZZZZ": country,
                 "BBBB": kind_code}),
            # query5/query6 define a JS UDF (highest_moving_avg); the \"""
            # sequences embed literal triple quotes inside the SQL string.
            "query5": """ -- "Most common patenting technology areas by year" CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>) RETURNS STRUCT<filing_year INT64, avg INT64> LANGUAGE js AS \""" let avg = 0; let a = 1.0; let highest = {filing_year: -1, avg: -1}; for (let x of yearcnt) { avg = a * x.cnt + (1 - a) * avg; if (avg > highest.avg) { highest = {filing_year: x.filing_year, avg: avg};} } return highest; \"""; WITH patent_cpcs AS ( SELECT cd.parents, CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year FROM ( SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date FROM `patents-public-data.patents.publications` WHERE application_number != "" GROUP BY application_number), UNNEST(cpc) AS cpcs JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code WHERE cpcs.first = TRUE AND filing_date > 0) SELECT c.title_full, cpc_group, best_year.* FROM ( SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year FROM ( SELECT cpc_group, filing_year, COUNT(*) AS cnt FROM ( SELECT cpc_parent AS cpc_group, filing_year FROM patent_cpcs, UNNEST(parents) AS cpc_parent) GROUP BY cpc_group, filing_year ORDER BY filing_year DESC, cnt DESC) GROUP BY cpc_group) JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol WHERE c.level = 5 ORDER BY best_year.filing_year ASC; """,
            "query6": replace(
                """ -- "Most common patenting technology areas in country Z between years X and Y" CREATE TEMPORARY FUNCTION highest_moving_avg(yearcnt ARRAY<STRUCT<filing_year INT64, cnt INT64>>) RETURNS STRUCT<filing_year INT64, avg INT64> LANGUAGE js AS \""" let avg = 0; let a = 1.0; let highest = {filing_year: -1, avg: -1}; for (let x of yearcnt) { avg = a * x.cnt + (1 - a) * avg; if (avg > highest.avg) { highest = {filing_year: x.filing_year, avg: avg};} } return highest; \"""; WITH patent_cpcs AS ( SELECT cd.parents, CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year FROM ( SELECT ANY_VALUE(cpc) AS cpc, ANY_VALUE(filing_date) AS filing_date FROM `patents-public-data.patents.publications` WHERE application_number != "" AND country_code = 'ZZZZ' AND grant_date >= XXXX0000 AND grant_date <= YYYY0000 GROUP BY application_number), UNNEST(cpc) AS cpcs JOIN `patents-public-data.cpc.definition` cd ON cd.symbol = cpcs.code WHERE cpcs.first = TRUE AND filing_date > 0) SELECT c.title_full, cpc_group, best_year.* FROM ( SELECT cpc_group, highest_moving_avg(ARRAY_AGG(STRUCT<filing_year INT64, cnt INT64>(filing_year, cnt) ORDER BY filing_year ASC)) AS best_year FROM ( SELECT cpc_group, filing_year, COUNT(*) AS cnt FROM ( SELECT cpc_parent AS cpc_group, filing_year FROM patent_cpcs, UNNEST(parents) AS cpc_parent) GROUP BY cpc_group, filing_year ORDER BY filing_year DESC, cnt DESC) GROUP BY cpc_group) JOIN `patents-public-data.cpc.definition` c ON cpc_group = c.symbol WHERE c.level = 5 ORDER BY best_year.filing_year ASC; """,
                {"XXXX": start_year, "YYYY": end_year, "ZZZZ": country}),
            "query7": replace(
                """ -- "Inventors with over N patents by country" WITH temp1 AS ( SELECT DISTINCT PUB.country_code, PUB.application_number AS patent_number, inventor_name FROM `patents-public-data.patents.publications` PUB CROSS JOIN UNNEST(PUB.inventor) AS inventor_name WHERE PUB.grant_date >= 1790000 AND PUB.country_code IS NOT NULL AND PUB.application_number IS NOT NULL AND PUB.inventor IS NOT NULL) SELECT * FROM ( SELECT temp1.country_code AS country, temp1.inventor_name AS inventor, COUNT(temp1.patent_number) AS count_of_patents FROM temp1 GROUP BY temp1.country_code, temp1.inventor_name) WHERE count_of_patents >= NNNN """,
                {"NNNN": min_count}),
            "query8": replace(
                """ -- "Patent landscaping of technology T between years X and Y" SELECT SUM(year_cnt) AS total_count, assignee_name, ARRAY_AGG(STRUCT<cnt INT64, filing_year INT64, countries STRING>(year_cnt, filing_year, countries) ORDER BY year_cnt DESC LIMIT 1)[SAFE_ORDINAL(1)] AS Number_of_patents_under_this_CPC_code_Peak_year_Top_countries FROM ( SELECT SUM(year_country_cnt) AS year_cnt, assignee_name, filing_year, STRING_AGG(country_code ORDER BY year_country_cnt DESC LIMIT 5) AS countries FROM ( SELECT COUNT(*) AS year_country_cnt, a.name AS assignee_name, CAST(FLOOR(filing_date/10000) AS INT64) AS filing_year, apps.country_code FROM ( SELECT ANY_VALUE(assignee_harmonized) AS assignee_harmonized, ANY_VALUE(filing_date) AS filing_date, ANY_VALUE(country_code) AS country_code FROM `patents-public-data.patents.publications` AS pubs WHERE (SELECT MAX(TRUE) FROM UNNEST(pubs.cpc) AS c WHERE REGEXP_CONTAINS(c.code, "TTTT")) AND publication_date >= XXXX0000 AND publication_date <= YYYY0000 GROUP BY application_number) AS apps, UNNEST(assignee_harmonized) AS a WHERE filing_date > 0 GROUP BY a.name, filing_year, country_code) GROUP BY assignee_name, filing_year) GROUP BY assignee_name ORDER BY total_count DESC LIMIT NNNN """,
                {"XXXX": start_year, "YYYY": end_year, "NNNN": min_count,
                 "TTTT": cpc_code}),
            "query9": replace(
                """ -- "Which firms is assignee A citing in their patents?" SELECT citing_assignee, COUNT(*) AS num_cites, citing_cpc_subclass, cpcdef.title_full AS citing_cpc_title FROM ( SELECT pubs.publication_number AS citing_publication_number, cite.publication_number AS cited_publication_number, citing_assignee_s.name AS citing_assignee, SUBSTR(cpcs.code, 0, 4) AS citing_cpc_subclass FROM `patents-public-data.patents.publications` AS pubs, UNNEST(citation) AS cite, UNNEST(assignee_harmonized) AS citing_assignee_s, UNNEST(cpc) AS cpcs WHERE cpcs.first = TRUE) AS pubs JOIN ( SELECT publication_number AS cited_publication_number, cited_assignee_s.name AS cited_assignee FROM `patents-public-data.patents.publications`, UNNEST(assignee_harmonized) AS cited_assignee_s) AS refs ON pubs.cited_publication_number = refs.cited_publication_number JOIN `patents-public-data.cpc.definition` AS cpcdef ON cpcdef.symbol = citing_cpc_subclass WHERE cited_assignee = "AAAA" AND citing_assignee != "AAAA" GROUP BY cited_assignee, citing_assignee, citing_cpc_subclass, cpcdef.title_full ORDER BY num_cites DESC """,
                {"AAAA": assignee}),
            "query10": replace(
                """ -- Number of patent applications with keyword K in country Z WITH Patent_Matches AS ( SELECT PARSE_DATE('%Y%m%d', SAFE_CAST(ANY_VALUE(patentsdb.filing_date) AS STRING)) AS Patent_Filing_Date, patentsdb.application_number AS Patent_Application_Number, ANY_VALUE(abstract_info.text) AS Patent_Title, ANY_VALUE(abstract_info.language) AS Patent_Title_Language FROM `patents-public-data.patents.publications` AS patentsdb, UNNEST(abstract_localized) AS abstract_info WHERE LOWER(abstract_info.text) LIKE '%KKKK%' AND patentsdb.country_code = 'ZZZZ' GROUP BY Patent_Application_Number), Date_Series_Table AS ( SELECT day, 0 AS Number_of_Patents FROM UNNEST (GENERATE_DATE_ARRAY( (SELECT MIN(Patent_Filing_Date) FROM Patent_Matches), (SELECT MAX(Patent_Filing_Date) FROM Patent_Matches))) AS day) SELECT SAFE_CAST(FORMAT_DATE('%Y-%m',Date_Series_Table.day) AS STRING) AS Patent_Date_YearMonth, COUNT(Patent_Matches.Patent_Application_Number) AS Number_of_Patent_Applications FROM Patent_Matches RIGHT JOIN Date_Series_Table ON Patent_Matches.Patent_Filing_Date = Date_Series_Table.day GROUP BY Patent_Date_YearMonth ORDER BY Patent_Date_YearMonth """,
                {"ZZZZ": country, "KKKK": keyword}),
        }
        # Unknown names fall through to the string "Invalid query".
        return switcher.get(query, "Invalid query")

    print("Estimated query size: {} GB.".format(
        bq_assistant.estimate_query_size(query_selector(query))))

    def kenvelo(question, answer="no"):
        # Interactive yes/no prompt; empty input falls back to `answer`.
        range_of_choices = {
            "yes": True,
            "y": True,
            "": True,
            "no": False,
            "n": False
        }
        if answer is None:
            prompt = "[Y/N]"
        elif answer == "yes":
            prompt = "[Y/N]"
        elif answer == "no":
            prompt = "[Y/N]"
        else:
            # NOTE(review): '%s' placeholder is never formatted with the bad
            # answer value — message will contain a literal '%s'.
            raise ValueError("Answer '%s' is invalid.")
        while True:
            sys.stdout.write(question + prompt)
            choice = input().lower()
            if answer is not None and choice == '':
                return range_of_choices[answer]
            elif choice in range_of_choices:
                return range_of_choices[choice]
            else:
                sys.stdout.write(
                    "Please answer any variation of 'yes' or 'no'. ")

    # NOTE(review): every branch below ends in `return`, so this `while`
    # executes its body at most once; it acts as a confirmation gate rather
    # than a loop.
    while ask_before_running is False or kenvelo("Run query? "):

        def plot_query1():
            # Bar chart of publications per country (log-scale y), saved as
            # PDF into my_output_path. `figure`, `sns`, `plt` come from
            # module-level imports (matplotlib/seaborn) — not visible here.
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='cnt',
                        data=patents.query_to_pandas_safe(
                            query_selector("query1"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query1"))))
            plt.title("Number of publications by country",
                      loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of publications (log)', fontsize=14)
            plt.xlabel('Country', fontsize=14)
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query1" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plot_query2():
            # Bar chart for query2 (publications per country in period).
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='country_code',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query2"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query2"))))
            plt.title(
                "Number of patents published between years {} and {} by country"
                .format(start_year, end_year),
                loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of publications')
            plt.xlabel('Country')
            plt.yscale('log')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query2" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plot_query3():
            # Bar chart for query3 (grants per year for one country).
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='datum',
                        y='publications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query3"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query3"))))
            plt.title(
                "Number of patents granted to country {} between years {} and {}"
                .format(country, start_year, end_year),
                loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of patents', fontsize=14)
            plt.xlabel('')
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query3" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plot_query10():
            # Bar chart for query10 (applications per month for a keyword).
            figure(num=None, figsize=(24, 16), facecolor='w', edgecolor='k')
            sns.set(context='paper', style='ticks', font_scale=0.9)
            sns.barplot(x='Patent_Date_YearMonth',
                        y='Number_of_Patent_Applications',
                        data=patents.query_to_pandas_safe(
                            query_selector("query10"),
                            max_gb_scanned=bq_assistant.estimate_query_size(
                                query_selector("query10"))))
            plt.title(
                "Number of patent applications of technology {} in country {}".
                format(keyword, country),
                loc='left', fontsize=24, style='oblique')
            plt.ylabel('# of applications', fontsize=14)
            plt.xlabel('Date', fontsize=14, rotation=45)
            sns.despine(offset=10, trim=True)
            plt.savefig(my_output_path + "query10" + '.pdf',
                        orientation='landscape', bbox_inches='tight')
            plt.show()

        def plotter(query):
            # Dispatch to the plot function matching `query`.
            # NOTE(review): the reassignments of `query` after each call are
            # dead stores — they only rebind the local and drive no further
            # plotting.
            if query == "query1":
                plot_query1()
                query = "query2"
            elif query == "query2":
                plot_query2()
                query = "query3"
            elif query == "query3":
                plot_query3()
                query = "query10"
            elif query == "query10":
                plot_query10()
            else:
                print('\033[1m' +
                      "Sorry, this query doesn't output graphs (yet)." +
                      '\033[0m')

        temp_path = Path(my_output_path)
        # max_gb_scanned is capped at `budget` but never exceeds the
        # estimated size of the query itself.
        if output_to_csv and plot:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(
                            Path(temp_path, query + '.csv'),
                            index=False,
                            encoding="utf-8"), plotter(query)
        if output_to_csv:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(
                        query_selector(query)))).to_csv(Path(
                            temp_path, query + '.csv'),
                                                        index=False,
                                                        encoding="utf-8")
        if plot:
            return plotter(query)
        else:
            return patents.query_to_pandas_safe(
                query_selector(query),
                max_gb_scanned=min(
                    budget,
                    bq_assistant.estimate_query_size(query_selector(query))))
        # NOTE(review): unreachable — all paths above return. Also calls
        # kenvelo with three positional arguments while its signature is
        # (question, answer="no"); would raise TypeError if ever reached.
        if kenvelo("You will use",
                   bq_assistant.estimate_query_size(query_selector(query)),
                   "GB in this query. Continue? ") is False:
            break
#!/usr/bin/env python # coding: utf-8 # In[2]: import bq_helper from bq_helper import BigQueryHelper import os os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "Cloud Final-1f49f4274b69.json" bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow") QUERY_1 = "SELECT ID,TITLE FROM `bigquery-public-data.stackoverflow.posts_questions` WHERE ID = 57804" QUERY = "SELECT q.id, q.title, q.body, q.tags, a.body as answers, a.score FROM `bigquery-public-data.stackoverflow.posts_questions` AS q INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a ON q.id = a.parent_id WHERE q.tags LIKE '%python%' LIMIT 10000" df = bq_assistant.query_to_pandas(QUERY) # In[3]: import findspark findspark.init() import pyspark # only run after findspark.init() from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() # In[5]: file_save_path = os.getcwd() + "/dataset/big_query.parquet" data_frame = spark.createDataFrame(df)
# Dump the full 2005 ICOADS marine-observation table (weather, waves, swell)
# to a CSV file.
from bq_helper import BigQueryHelper
from google.cloud import bigquery

#before executing this code you have to export your key : export GOOGLE_APPLICATION_CREDENTIALS="path to your key.json"

# NOTE(review): this Client is created but never used below; the query runs
# through bq_assistant instead.
client = bigquery.Client(project='Oceans')
bq_assistant = BigQueryHelper(active_project="bigquery-public-data",
                              dataset_name="noaa_icoads")
# Exploratory calls; their return values are discarded when run as a script.
bq_assistant.list_tables()
bq_assistant.table_schema('icoads_core_2005')

query = """SELECT latitude, longitude, wind_direction_true, wind_speed, present_weather, sea_level_pressure, air_temperature, wetbulb_temperature, sea_surface_temp, total_cloud_amount, cloud_height, wave_direction, wave_period, wave_height, swell_direction, swell_period, swell_height, timestamp FROM `bigquery-public-data.noaa_icoads.icoads_core_2005` """
#execute the query.
# NOTE(review): query_to_pandas_safe returns None if the scan exceeds its
# budget; res.to_csv would then fail — confirm intended.
res = bq_assistant.query_to_pandas_safe(query)
#save the result.
res.to_csv("resultat_total_20052.csv", sep=',')
# Print the estimated scan cost (GB) of two full-table queries without
# actually running them.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"
from bq_helper import BigQueryHelper

client = BigQueryHelper('bigquery-public-data', 'stackoverflow')
QUERY = """ SELECT * FROM `bigquery-public-data.stackoverflow.comments` """
print(client.estimate_query_size(QUERY))  #13.3 GB

bq_assistant = BigQueryHelper('bigquery-public-data', 'new_york')
QUERY = """ SELECT * FROM `bigquery-public-data.new_york.citibike_trips` """
print(bq_assistant.estimate_query_size(QUERY))  #4.6 GB
# Sample rows from the EPA hazardous-air-pollutant hourly summary and build a
# Plotly trace of measurements over time.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"
import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

# BUG FIX: dataset was 'hap_hourly_summary', which does not match the table
# queried below; the table lives in 'epa_historical_air_quality'.
bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality')

# BUG FIX: the original single-quoted every identifier, so BigQuery would
# have treated the column names as string literals and the table name as an
# invalid literal. Use backticks and underscore column names.
# NOTE(review): original asked for 'country-name'; the EPA tables use
# county_name — confirm the intended column.
QUERY = """ SELECT `county_name`, `date_local`, `sample_measurement`, `method_code` FROM `bigquery-public-data.epa_historical_air_quality.epa_hap_hourly_summary` LIMIT 100 """
df = bq_assistant.query_to_pandas(QUERY)

trace1 = go.Scatter(
    # BUG FIX: originally referenced undefined frames df_observation_ and
    # df_observation_state; plot the queried frame instead. Also closes the
    # go.Scatter call, which was left unterminated.
    x=df.date_local,
    y=df.sample_measurement,
    mode='lines+markers',
    name='Observation percent depending on state',
)
# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from google.cloud import bigquery import bq_helper from bq_helper import BigQueryHelper stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data", dataset_name="stackoverflow") bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow") bq_assistant.list_tables() tables = [ 'badges', 'comments', 'post_history', 'post_links', 'posts_answers', 'posts_moderator_nomination', 'posts_orphaned_tag_wiki', 'posts_privilege_wiki', 'posts_questions', 'posts_tag_wiki', 'posts_tag_wiki_excerpt', 'posts_wiki_placeholder', 'stackoverflow_posts', 'tags', 'users', 'votes' ] for t in tables: tag = bq_assistant.head(t, num_rows=5000) tag.to_csv('stackoverflow_' + t + '.csv')
import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from google.cloud import bigquery from bq_helper import BigQueryHelper # Input data files are available in the "../input/" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory # import os # print(os.listdir("../input")) # Any results you write to the current directory are saved as output. #some queries to extract the data from the hacker news dataset #I end up getting chunks from the tables, as chunks >1gig tend to fail even if I can theoretically get upto 5gig bq_assistant = BigQueryHelper("bigquery-public-data", "hacker_news") #pandas_client = bigquery.Client() #bq_assistant.list_tables() #Gets the 'stories' table into a csv file #QUERY1 = """ # SELECT # * # FROM # `bigquery-public-data.hacker_news.stories` # """ #bq_assistant.estimate_query_size(QUERY1) #df = bq_assistant.query_to_pandas_safe(QUERY) #df.to_csv('stories_test.csv', index = False) #Gets the 'comments' table into a csv file
query = """ SELECT block_timestamp, from_address, to_address, value, receipt_contract_address, receipt_status, block_number FROM `bigquery-public-data.ethereum_blockchain.transactions` AS transactions WHERE TRUE AND block_timestamp < '2017-01-01 00:00:00' """ # This establishes an authenticated session and prepares a reference to the dataset that lives in BigQuery. bq_assistant = BigQueryHelper("bigquery-public-data", "ethereum_blockchain") print("estimated query data size: " + str(bq_assistant.estimate_query_size(query))) df = bq_assistant.query_to_pandas_safe(query, max_gb_scanned=60) df.to_csv( "/home/yueduan/yueduan/postdoc_study/ether_transactions_before2017.csv") # # Import dataset from big query # from google.cloud import bigquery # from bq_helper import BigQueryHelper # import matplotlib.pyplot as plt # import seaborn as sns # import pandas as pd # #%matplotlib inline
# Yearly averages from the EPA historical air-quality dataset: temperature
# (converted F -> C in SQL) and carbon monoxide.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.cloud import bigquery
from bq_helper import BigQueryHelper

bq_assist = BigQueryHelper(active_project='bigquery-public-data',
                           dataset_name='epa_historical_air_quality')
# Exploratory; return values discarded when run as a script.
bq_assist.list_tables()
bq_assist.head('temperature_daily_summary')

# Average temperature per year, Fahrenheit converted to Celsius inline.
query = """ SELECT EXTRACT (YEAR FROM date_local) AS Year, AVG ((arithmetic_mean - 32.0)/ 1.80) AS avg_temp_celcius FROM `bigquery-public-data.epa_historical_air_quality.temperature_daily_summary` GROUP BY Year Order BY Year """
avg_temp = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

# Average CO concentration per year.
query = """ SELECT EXTRACT (YEAR FROM date_local) AS Year, AVG (arithmetic_mean) AS avg_co FROM `bigquery-public-data.epa_historical_air_quality.co_daily_summary` GROUP BY Year Order BY Year """
avg_co = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)
# Estimate the cost of a full scan of the NYC citibike trips table, then
# fetch and preview a 100-row sample.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"
import pandas as pd
from bq_helper import BigQueryHelper

bq_assistant = BigQueryHelper('bigquery-public-data', 'new_york')

# Dry-run size estimate of the unrestricted query.
QUERY = """ SELECT * FROM `bigquery-public-data.new_york.citibike_trips` """
print(bq_assistant.estimate_query_size(QUERY))  #4.5 GB

# Actual fetch is limited to 100 rows.
QUERY = """ SELECT * FROM `bigquery-public-data.new_york.citibike_trips` LIMIT 100 """
df = bq_assistant.query_to_pandas(QUERY)
print(df.head(10))
import bq_helper from bq_helper import BigQueryHelper stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data", dataset_name="stackoverflow") bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow") tabelas = bq_assistant.list_tables() esquemas_tabelas = {} query = """ SELECT Year, Tag, Total, Percent_Questions_with_Answers FROM (SELECT EXTRACT(YEAR FROM a.creation_date) as Year, t.tag_name as Tag, COUNT(1) as Total, ROUND(100 * SUM(IF(a.answer_count > 0, 1, 0)) / COUNT(*), 1) AS Percent_Questions_with_Answers FROM `bigquery-public-data.stackoverflow.posts_questions` a right JOIN `bigquery-public-data.stackoverflow.tags` t ON t.tag_name in UNNEST(SPLIT(a.tags,'|')) GROUP BY Year, Tag HAVING Year > 2019 AND Year < 2021 ORDER BY Total DESC LIMIT 20) ORDER BY Percent_Questions_with_Answers DESC """ response = stackOverflow.query_to_pandas_safe(query, max_gb_scanned=20) response.head(20) query1 = "SELECT tag_name as Assunto, count as Num_perguntas FROM `bigquery-public-data.stackoverflow.tags` order BY count DESC" response1 = stackOverflow.query_to_pandas_safe(query1, max_gb_scanned=20) response1.head(20) query2 = """SELECT Day_of_Week, COUNT(1) AS Num_Questions, SUM(answered_in_1h) AS Num_Answered_in_1H,
def setUp(self): self.my_bq = BigQueryHelper("bigquery-public-data", "openaq") self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`" # Query randomized so it won't hit the cache across multiple test runs self.randomizable_query = """
class TestBQHelper(unittest.TestCase):
    """Integration tests for BigQueryHelper against the public OpenAQ dataset.

    These tests hit live BigQuery, so they require credentials and network
    access when run.
    """

    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        # The openaq dataset exposes exactly one table.
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11)

    def test_estimate_query_size(self):
        # Estimates are returned in (fractional) gigabytes.
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        # A tiny scan limit (10**-10 GB) forces the safe wrapper to refuse.
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_usage_tracker(self):
        # Fix: renamed from 'test_useage_tracker' (spelling); unittest still
        # discovers it via the 'test_' prefix.
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        # Nested/repeated fields must be flattened into the schema listing.
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
display(f, df) # ### Images # Let's use the data to annotate some train images and see what we're trying to do. You'll see there is some complexity here, both with the labels and the boxes. # In[ ]: train = pd.read_csv('../input/train_human_labels.csv', usecols=['ImageID', 'LabelName']) descrips = pd.read_csv('../input/class-descriptions.csv', names=['LabelName', 'Description']) train = train.merge(descrips, how='left', on='LabelName') train.head(9) open_images = BigQueryHelper(active_project="bigquery-public-data", dataset_name="open_images") query = """ SELECT image_id, original_url FROM `bigquery-public-data.open_images.images` WHERE image_id IN UNNEST(['0199bc3e1db115d0', '4fa8054781a4c382', '51c5d8d5d9cd87ca', '9ec02b5c0315fcd1', 'b37f763ae67d0888', 'ddcb4b7478e9917b']) """ urls = open_images.query_to_pandas_safe(query) boxes = pd.read_csv('../input/train_bounding_boxes.csv') boxes = boxes[boxes.ImageID.isin( urls.image_id.tolist())].sort_values('ImageID')
def test_list_nested_schema(self):
    """github_repos.commits has nested fields; its flattened schema is 33 long."""
    helper = BigQueryHelper("bigquery-public-data", "github_repos")
    schema = helper.table_schema('commits')
    self.assertEqual(len(schema), 33)
# Chart daily lead-measurement metadata from the EPA dataset.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="keys.json"
import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

# NOTE(review): arguments appear swapped — elsewhere in this codebase the
# helper is built as BigQueryHelper(project, dataset), i.e.
# ('bigquery-public-data', 'epa_historical_air_quality'). Confirm.
bq_assistant = BigQueryHelper('epa_historical_air_quality' ,'bigquery-public-data')

QUERY = """ SELECT state_code, sample_duration, date_local, units_of_measure FROM bigquery-public-data.epa_historical_air_quality.lead_daily_summary LIMIT 100 """

# NOTE(review): `df` is never assigned in this chunk — the query above is never
# executed (expected something like df = bq_assistant.query_to_pandas(QUERY)).
_sample_duration_count = df.groupby(['date_local'])['sample_duration'].count()
_units_of_measure_count = df.groupby(['date_local'])['units_of_measure'].count()
# NOTE(review): 'code_count' is not among the selected columns above; this
# lookup would raise KeyError on the queried frame.
_code_count = df.groupby(['date_local'])['code_count'].count()

trace1 = go.Bar(
    x=_sample_duration_count.index,
    y=_sample_duration_count.values
)
# NOTE(review): plotly Pie traces take labels=/values=, not x=/y= — verify.
trace2 = go.Pie(
    x=_units_of_measure_count.index,
    y=_units_of_measure_count.values
)
# NOTE(review): this call is truncated in this chunk; it continues (and is
# presumably closed) beyond this view.
trace3 = go.Scatter(
    x=_code_count.index,
    y=_code_count.values,
    mode='lines'
# https://github.com/SohierDane/BigQuery_Helper
from bq_helper import BigQueryHelper

# Helper bound to the public GitHub repos dataset.
bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")

# Sample of short commit messages (7–20 characters).
QUERY = """
SELECT message
FROM `bigquery-public-data.github_repos.commits`
WHERE LENGTH(message) > 6 AND LENGTH(message) <= 20
LIMIT 2000
"""

# Check the scan cost before running anything against this very large table.
bq_assistant.estimate_query_size(QUERY)

# Fix: the helper method is list_tables(), not listtables() (the original
# raised AttributeError).
bq_assistant.list_tables()

# Any results you write to the current directory are saved as output.
# Fetch the two most-answered Stack Overflow posts.
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"

import pandas as pd
from bq_helper import BigQueryHelper

client = BigQueryHelper('bigquery-public-data', 'stackoverflow')

# Single literal instead of per-clause concatenation; the SQL text produced
# is character-for-character the same.
query = ("SELECT title, answer_count "
         "FROM bigquery-public-data.stackoverflow.stackoverflow_posts "
         "ORDER BY answer_count DESC "
         "LIMIT 2")
df = client.query_to_pandas(query)
print(df)
import seaborn as sns
plt.style.use('fivethirtyeight')

# plotly setup for inline notebook plotting
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as fig_fact
plotly.tools.set_config_file(world_readable=True, sharing='public')

# Helper bound to the public OpenAQ air-quality dataset.
bq_assistant = BigQueryHelper("bigquery-public-data", "openaq")
get_ipython().run_line_magic('matplotlib', 'inline')

# # TL;DR
# > A country plot of air pollution. Hover over the desired location, and it
# will show you the average value of some dangerous elements.

# Per-country average ozone reading.
# NOTE(review): the query is truncated in this chunk — the WHERE clause
# continues beyond this view.
QUERY = """ SELECT country, avg(value) as o3_avg_value FROM `bigquery-public-data.openaq.global_air_quality` WHERE
# Sample 15 SF bikeshare trips and plot trip duration per trip.
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

import pandas
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

bq_assistant = BigQueryHelper("bigquery-public-data", "san_francisco")

QUERY = """
SELECT `trip_id`, `duration_sec`, `start_station_name`, `subscriber_type`
FROM `bigquery-public-data.san_francisco.bikeshare_trips`
LIMIT 15
"""
df = bq_assistant.query_to_pandas(QUERY)

# One point per returned row; y is the trip duration in seconds.
trace1 = go.Scatter(
    x=df.trip_id.index,
    y=df.duration_sec.values,
    mode="lines",
    name="trip duration by trip",
)
layoutScatter = dict(
    title="placeholder",
    xaxis=dict(title="tripnumber"),
    yaxis=dict(title="tripduration"),
)
tracescatter = dict(data=[trace1], layout=layoutScatter)
plot(tracescatter)

# Trip count per subscriber type.
numberofcustomers = df.groupby(["subscriber_type"])["trip_id"].count()
# Sample PM10 daily summaries and build traces of observation counts per county.
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json"

import pandas as pd
from bq_helper import BigQueryHelper
import plotly.graph_objs as go
from plotly.offline import plot

# Fix: project id is 'bigquery-public-data' (was misspelled 'bigquerry-...'),
# and the dataset is 'epa_historical_air_quality'; 'pm10_daily_summary' is a
# table inside it, not a dataset.
bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality')

# Fix: fully-qualified project.dataset.table path (the original pointed at a
# nonexistent 'bigquerry-public-data._pm10_daily_summary').
QUERY = """
SELECT county_code, sample_duration, date_local, observation_count
FROM `bigquery-public-data.epa_historical_air_quality.pm10_daily_summary`
LIMIT 100
"""
df = bq_assistant.query_to_pandas(QUERY)

# Fix: DataFrame has groupby(), not grouphy().
local_count_1 = df.groupby(['county_code'])['observation_count'].count()
local_count_2 = df.groupby(['county_code'])['observation_count'].count()
local_count_3 = df.groupby(['county_code'])['observation_count'].count()

# Fix: a pandas Series exposes .values (plural); .value raises AttributeError.
trace3 = go.Scatter(x=local_count_3.index, y=local_count_3.values,
                    marker=dict(color='rgba(225, 174, 255, 0.5)'))
trace2 = go.Scatter(x=local_count_2.index, y=local_count_2.values,
                    marker=dict(color='rgba(225, 174, 255, 0.5)'))
trace1 = go.Bar(x=local_count_1.index, y=local_count_1.values,
                marker=dict(color='rgba(225, 174, 255, 0.5)'))
# Plot occurrence counts for the first 150 rows of the USA names table.
import os

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"

from bq_helper import BigQueryHelper
import plotly.graph_objects as go
from plotly.offline import plot

# Fix: the dataset id is 'usa_names' (underscore), matching the table path used
# in the query below; 'usa-names' is not the dataset's id.
client = BigQueryHelper('bigquery-public-data', 'usa_names')

query = "select name, number from bigquery-public-data.usa_names.usa_1910_current limit 150"
df = client.query_to_pandas(query)

x = df['name']
y = df['number']

# Line plot: occurrence count per name.
name = go.Scatter(x=x, y=y, mode="lines")
layout = go.Layout(title='Scatter')
fig = go.Figure(data=[name], layout=layout)
plot(fig)
def main():
    """Export 2015/2016 NHTSA fatal-accident + vehicle/driver data to CSV.

    Runs one large parameterized query per year against the public
    nhtsa_traffic_fatalities dataset and writes each result frame under
    ``../data/accidents``.
    """
    # Output directory for the exported CSVs.
    path = '../data/accidents'
    bq_assistant = BigQueryHelper('bigquery-public-data', 'nhtsa_traffic_fatalities')
    # One row per (accident, vehicle): crash time/place context, road and
    # weather conditions, casualty counts, and per-vehicle driver factors,
    # joined across four per-year tables. {0} is substituted with the year.
    # NOTE(review): the SELECT list ends with a trailing comma before FROM —
    # relies on BigQuery accepting trailing commas; confirm.
    QUERY = """ SELECT a.state_number, a.consecutive_number, veh.vehicle_number, state_name, a.hour_of_crash AS hour_of_day, CASE WHEN day_of_week = 1 THEN 'Sunday' WHEN day_of_week = 2 THEN 'Monday' WHEN day_of_week = 3 THEN 'Tuesday' WHEN day_of_week = 4 THEN 'Wednesday' WHEN day_of_week = 5 THEN 'Thursday' WHEN day_of_week = 6 THEN 'Friday' WHEN day_of_week = 7 THEN 'Saturday' END day_of_week, CASE WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 1 THEN 'January' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 2 THEN 'February' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 3 THEN 'March' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 4 THEN 'April' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 5 THEN 'May' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 6 THEN 'June' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 7 THEN 'July' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 8 THEN 'August' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 9 THEN 'September' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 10 THEN 'October' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 11 THEN 'November' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 12 THEN 'December' END month, EXTRACT(YEAR FROM a.timestamp_of_crash) AS year, land_use_name AS land_use, national_highway_system, route_signing_name AS roadway_type, type_of_intersection AS intersection, light_condition_name AS light_condition, atmospheric_conditions_1_name AS atmospheric_conditions, latitude, longitude, manner_of_collision_name AS manner_of_collision, number_of_vehicle_forms_submitted_all AS num_vehicles, number_of_persons_not_in_motor_vehicles_in_transport_mvit AS num_nonmotorists, number_of_persons_in_motor_vehicles_in_transport_mvit AS num_motorists, number_of_fatalities AS num_fatalities, number_of_drunk_drivers AS num_drunk_drivers, body_type_name AS body_type, vehicle_model_year AS vehicle_year, rollover, contributing_circumstances_motor_vehicle_name AS vehicle_conditions, previous_dwi_convictions, previous_speeding_convictions, speeding_related, speed_limit, drivers_vision_obscured_by_name AS driver_vision_obscured, FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_{0}` AS a JOIN `bigquery-public-data.nhtsa_traffic_fatalities.vehicle_{0}` AS veh ON a.consecutive_number=veh.consecutive_number JOIN `bigquery-public-data.nhtsa_traffic_fatalities.factor_{0}` AS f ON a.consecutive_number=f.consecutive_number AND veh.vehicle_number=f.vehicle_number JOIN `bigquery-public-data.nhtsa_traffic_fatalities.vision_{0}` AS v ON a.consecutive_number=v.consecutive_number AND veh.vehicle_number=v.vehicle_number ORDER BY consecutive_number, vehicle_number """
    # Same query text, run once per year (table names carry the year suffix).
    data_2015 = bq_assistant.query_to_pandas(QUERY.format(2015))
    data_2016 = bq_assistant.query_to_pandas(QUERY.format(2016))
    data_2015.to_csv(f'{path}/accident_driver_data_2015.csv', index=False)
    data_2016.to_csv(f'{path}/accident_driver_data_2016.csv', index=False)
def create_github_bq_client():
    """Create and return a BigQueryHelper bound to the GH Archive 'day' dataset."""
    helper = BigQueryHelper(active_project="githubarchive", dataset_name="day")
    _logger.info('Setting up BQ Client')
    return helper
# NOTES:
# - Needs Google Cloud authentication:
#   https://cloud.google.com/docs/authentication/getting-started#auth-cloud-implicit-python
# - The path to the json auth file obtained from Google auth must be set in
#   your PATH — and re-set for every new shell session.
# - bq_helper wraps the Google Cloud BigQuery client for easier querying:
#   https://github.com/SohierDane/BigQuery_Helper
from bq_helper import BigQueryHelper
import pandas as pd

print('Retrieving NOAA GSOD data.')
bq_assist = BigQueryHelper("bigquery-public-data", "noaa_gsod")
print('Successfully retrieved data.')

start_yr = 2010
end_yr = 2019

# One CSV per year of daily readings for three fixed WBAN station ids; the
# per-year table name carries the year suffix.
for year in range(start_yr, end_yr):
    query = (
        f"SELECT stn, year, mo AS month, da AS day, temp, slp "
        f"FROM `bigquery-public-data.noaa_gsod.gsod{year}` "
        f"WHERE wban='23174' OR wban='23129' OR wban='93134' "
    )
    df_weather = bq_assist.query_to_pandas_safe(query, max_gb_scanned=5)
    filename = f'socal_weather_{year}.csv'
    df_weather.to_csv(filename, index=False)
    print(f"Saved {filename}")