import unittest
from random import random

from pandas import DataFrame
from google.api_core.exceptions import BadRequest

from bq_helper import BigQueryHelper


class TestBQHelper(unittest.TestCase):
    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value
            FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame)

    def test_query_safe_fails(self):
        # A different query must be used for this test to ensure we don't hit
        # the cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_usage_tracker(self):
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
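# Not in the original file: a standard entry point so the suite can also be
# run directly as a script (the file name is an assumption).
if __name__ == '__main__':
    unittest.main()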
import pandas as pd
# https://github.com/SohierDane/BigQuery_Helper
from bq_helper import BigQueryHelper

bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")
QUERY = """
        SELECT *
        FROM `bigquery-public-data.github_repos.languages`
        """
df = bq_assistant.query_to_pandas_safe(QUERY)


def proc_language(row):
    return list(map(lambda x: x['name'], row['language']))


df['language'] = df.apply(proc_language, axis=1)
df.to_csv('language_processed.csv')

from ast import literal_eval

count = {}
byte_count = {}


def stats(row):
    global count
    global byte_count
    for lang in literal_eval(row['language']):
        count[lang] = count.get(lang, 0) + 1
        # byte_count would need the 'bytes' field of each language record,
        # which proc_language drops above; only occurrence counts are
        # recoverable from the saved CSV.
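# Sketch (not in the original kernel): apply stats over the saved CSV and
# report the most common languages; the file name matches the to_csv call above.
df_saved = pd.read_csv('language_processed.csv')
df_saved.apply(stats, axis=1)
top_languages = sorted(count.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top_languages)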
import pandas as pd
import matplotlib.pyplot as plt
from urllib import request
from bq_helper import BigQueryHelper

# `train` and `descrips` come from earlier cells not shown here.
train = train.merge(descrips, how='left', on='LabelName')
train.head(9)

open_images = BigQueryHelper(active_project="bigquery-public-data",
                             dataset_name="open_images")
query = """
        SELECT image_id, original_url
        FROM `bigquery-public-data.open_images.images`
        WHERE image_id IN UNNEST(['0199bc3e1db115d0', '4fa8054781a4c382',
                                  '51c5d8d5d9cd87ca', '9ec02b5c0315fcd1',
                                  'b37f763ae67d0888', 'ddcb4b7478e9917b'])
        """
urls = open_images.query_to_pandas_safe(query)

boxes = pd.read_csv('../input/train_bounding_boxes.csv')
boxes = boxes[boxes.ImageID.isin(
    urls.image_id.tolist())].sort_values('ImageID')


# In[ ]:


imlist = urls.image_id.tolist()
files = ['../input/openimages-support/ims/{}.jpg'.format(i) for i in imlist]

fig, ax = plt.subplots()
fig.set_size_inches((15, 15))
ax.set_axis_off()
for n, (file, image) in enumerate(zip(files, imlist)):
    a = fig.add_subplot(2, 3, n + 1)
    req = request.urlopen(urls.original_url[n])
    # The original cell is truncated here; reading and showing the fetched
    # image is the evident intent.
    a.imshow(plt.imread(req, format='jpg'))
    a.set_axis_off()
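# Sketch (not in the original kernel): the `boxes` frame loaded above is a
# natural fit for overlaying detections on these images. The XMin/XMax/
# YMin/YMax column names and normalized coordinates are assumptions based
# on the Open Images bounding-box format.
import matplotlib.patches as patches


def draw_boxes(axes, image_id, width, height):
    for _, b in boxes[boxes.ImageID == image_id].iterrows():
        axes.add_patch(patches.Rectangle(
            (b.XMin * width, b.YMin * height),   # top-left corner in pixels
            (b.XMax - b.XMin) * width,           # box width
            (b.YMax - b.YMin) * height,          # box height
            fill=False, edgecolor='red', linewidth=2))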
# Pulls data from the BigQuery 4TB dataset of GitHub at
# https://www.kaggle.com/github/github-repos
# * Create a kernel at https://www.kaggle.com/mrisdal/safely-analyzing-github-projects-popular-licenses
# ! Don't run this code here! Do it in the kernel!
import pandas as pd
from bq_helper import BigQueryHelper

bq_assistant = BigQueryHelper("bigquery-public-data", "github_repos")
QUERY = """
        SELECT author, subject, repo_name
        FROM `bigquery-public-data.github_repos.commits`
        LIMIT 250000
        """
print(f"Using {bq_assistant.estimate_query_size(QUERY)} GB / 5 TB Limit")
df = bq_assistant.query_to_pandas_safe(QUERY, max_gb_scanned=125)
print(df.head())
print(
    f'Size of dataframe: {int(df.memory_usage(index=True, deep=True).sum()) / 2**20} MB'
)
df.to_csv('commits.csv')
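# query_to_pandas_safe returns None when the scan estimate exceeds
# max_gb_scanned (the helper's own tests assert this), so in practice a
# guard right after the query call avoids an AttributeError further down.
# Defensive sketch, not part of the original kernel:
if df is None:
    raise RuntimeError("Query exceeded the 125 GB scan budget and was cancelled")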
QUERY = """ SELECT country, avg(value) as o3_avg_value FROM `bigquery-public-data.openaq.global_air_quality` WHERE pollutant = 'o3' AND country != 'NL' AND unit = 'µg/m³' GROUP BY country ORDER BY o3_avg_value ASC """ df_all = bq_assistant.query_to_pandas_safe(QUERY) # In[ ]: QUERY = """ SELECT country, avg(value) as no2_avg_value FROM `bigquery-public-data.openaq.global_air_quality` WHERE pollutant = 'no2' AND unit = 'µg/m³' GROUP BY country ORDER BY no2_avg_value ASC """
from bq_helper import BigQueryHelper
from google.cloud import bigquery

# Before executing this code you have to export your key:
#   export GOOGLE_APPLICATION_CREDENTIALS="path to your key.json"
client = bigquery.Client(project='Oceans')
bq_assistant = BigQueryHelper(active_project="bigquery-public-data",
                              dataset_name="noaa_icoads")
bq_assistant.list_tables()
bq_assistant.table_schema('icoads_core_2005')

query = """
    SELECT latitude, longitude, wind_direction_true, wind_speed,
           present_weather, sea_level_pressure, air_temperature,
           wetbulb_temperature, sea_surface_temp, total_cloud_amount,
           cloud_height, wave_direction, wave_period, wave_height,
           swell_direction, swell_period, swell_height, timestamp
    FROM `bigquery-public-data.noaa_icoads.icoads_core_2005`
    """
# Execute the query.
res = bq_assistant.query_to_pandas_safe(query)
# Save the result.
res.to_csv("resultat_total_20052.csv", sep=',')
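# Alternative (not in the original script): the Google Cloud client
# libraries read GOOGLE_APPLICATION_CREDENTIALS from the environment, so the
# key path can also be set from inside Python before any client is created.
# The path below is a placeholder.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your-key.json"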
query = """
    SELECT
        to_address, value, receipt_contract_address, receipt_status,
        block_number
    FROM `bigquery-public-data.ethereum_blockchain.transactions` AS transactions
    WHERE TRUE
      AND block_timestamp < '2017-01-01 00:00:00'
    """

# This establishes an authenticated session and prepares a reference to the
# dataset that lives in BigQuery.
bq_assistant = BigQueryHelper("bigquery-public-data", "ethereum_blockchain")
print("estimated query data size: " +
      str(bq_assistant.estimate_query_size(query)))
df = bq_assistant.query_to_pandas_safe(query, max_gb_scanned=60)
df.to_csv(
    "/home/yueduan/yueduan/postdoc_study/ether_transactions_before2017.csv")


# # Import dataset from big query

# from google.cloud import bigquery
# from bq_helper import BigQueryHelper
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# #%matplotlib inline
# plt.style.use('ggplot')
# sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
# # This establishes an authenticated session and prepares a reference to the
# # dataset that lives in BigQuery.
import pandas as pd
import matplotlib.pyplot as plt
from google.cloud import bigquery
from bq_helper import BigQueryHelper

bq_assist = BigQueryHelper(active_project='bigquery-public-data',
                           dataset_name='epa_historical_air_quality')
bq_assist.list_tables()
bq_assist.head('temperature_daily_summary')

query = """
    SELECT EXTRACT(YEAR FROM date_local) AS Year,
           AVG((arithmetic_mean - 32.0) / 1.80) AS avg_temp_celsius
    FROM `bigquery-public-data.epa_historical_air_quality.temperature_daily_summary`
    GROUP BY Year
    ORDER BY Year
    """
avg_temp = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

query = """
    SELECT EXTRACT(YEAR FROM date_local) AS Year,
           AVG(arithmetic_mean) AS avg_co
    FROM `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
    GROUP BY Year
    ORDER BY Year
    """
avg_co = bq_assist.query_to_pandas_safe(query, max_gb_scanned=10)

concat = pd.concat([avg_temp, avg_co.avg_co], axis=1)

fig, ax = plt.subplots(figsize=(16, 5))
color = 'tab:blue'
ax.bar(concat.Year, concat.avg_temp_celsius, color=color)
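# The cell is truncated after the temperature bars; a plausible continuation
# (an assumption, not recovered from the source) overlays the CO averages on
# a secondary axis so the two series share the Year axis.
ax.set_xlabel('Year')
ax.set_ylabel('Avg temperature (°C)', color=color)
ax2 = ax.twinx()
ax2.plot(concat.Year, concat.avg_co, color='tab:red')
ax2.set_ylabel('Avg CO', color='tab:red')
plt.show()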
# NOTES:
# - Need a Google Cloud authentication:
#   https://cloud.google.com/docs/authentication/getting-started#auth-cloud-implicit-python
# - The path to the JSON auth file you get from Google needs to be set in
#   the GOOGLE_APPLICATION_CREDENTIALS environment variable.
#   - This means you're going to have to set it every time you start a new
#     shell session.
# - bq_helper abstracts retrieving big queries through Google Cloud for
#   an easier time:
#   https://github.com/SohierDane/BigQuery_Helper
from bq_helper import BigQueryHelper
import pandas as pd

print('Retrieving NOAA GSOD data.')
bq_assist = BigQueryHelper("bigquery-public-data", "noaa_gsod")
print('Successfully retrieved data.')

start_yr = 2010
end_yr = 2019
# range() excludes end_yr, so this covers 2010 through 2018.
for year in range(start_yr, end_yr):
    query = ("SELECT stn, year, mo AS month, da AS day, temp, slp "
             "FROM `bigquery-public-data.noaa_gsod.gsod{}` "
             "WHERE wban='23174' OR wban='23129' OR wban='93134' ").format(year)
    df_weather = bq_assist.query_to_pandas_safe(query, max_gb_scanned=5)
    filename = 'socal_weather_{}.csv'.format(year)
    df_weather.to_csv(filename, index=False)
    print("Saved {}".format(filename))
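# Sketch (not in the original script): stitch the per-year files back into a
# single frame for downstream analysis; the glob pattern matches the
# filenames written by the loop above.
import glob

df_all_years = pd.concat(
    (pd.read_csv(f) for f in sorted(glob.glob('socal_weather_*.csv'))),
    ignore_index=True)
print(df_all_years.shape)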