from bq_helper import BigQueryHelper


def main():
    noaa_goes = BigQueryHelper(active_project="bigquery-public-data",
                               dataset_name="noaa_goes16")
    # Print the dataset's tables, the radiance table's schema, and a preview
    print(noaa_goes.list_tables())

    print(noaa_goes.table_schema('abi_l1b_radiance'))
    print(noaa_goes.head("abi_l1b_radiance", num_rows=10))

    query = """
    SELECT dataset_name, platform_id, scene_id FROM `bigquery-public-data.noaa_goes16.abi_l1b_radiance` WHERE geospatial_westbound_longitude<120 and geospatial_eastbound_longitude>75 and geospatial_northbound_latitude<50 and geospatial_southbound_latitude>30
    """

    print("Query size in GB is %f " % noaa_goes.estimate_query_size(query))
Example No. 2
import unittest
from random import random

from google.api_core.exceptions import BadRequest
from pandas import DataFrame

from bq_helper import BigQueryHelper


class TestBQHelper(unittest.TestCase):
    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')),
                         11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query),
                              float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query),
                              DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query),
                              DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_usage_tracker(self):
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
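The suite above never invokes a runner; a minimal sketch of the standard unittest entry point, assuming the tests live in a standalone file:

if __name__ == '__main__':
    unittest.main()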
Example No. 3
import pandas as pd
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
medicare = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                    dataset_name="cms_medicare")
bq_assistant = BigQueryHelper("bigquery-public-data", "cms_medicare")
bq_assistant.list_tables()
bq_assistant.head("inpatient_charges_2015", num_rows=15)
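A natural next step is an actual query against the dataset, run with a capped scan; a minimal sketch. The column names below are assumptions for illustration, not verified against the cms_medicare schema:

# Hedged example: provider_state / average_total_payments are assumed column
# names; confirm with bq_assistant.table_schema("inpatient_charges_2015").
state_query = """
SELECT provider_state, AVG(average_total_payments) AS avg_total_payment
FROM `bigquery-public-data.cms_medicare.inpatient_charges_2015`
GROUP BY provider_state
ORDER BY avg_total_payment DESC
"""
print("Estimated GB: %f" % bq_assistant.estimate_query_size(state_query))
state_df = medicare.query_to_pandas_safe(state_query, max_gb_scanned=1)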
Example No. 4
import bq_helper
from bq_helper import BigQueryHelper
stackOverflow = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                         dataset_name="stackoverflow")
bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")

tabelas = bq_assistant.list_tables()  # list of table names in the dataset
esquemas_tabelas = {}  # placeholder for per-table schemas (unused below)

query = """
SELECT Year, Tag, Total, Percent_Questions_with_Answers
FROM (SELECT EXTRACT(YEAR FROM a.creation_date) as Year, t.tag_name as Tag, COUNT(1) as Total, ROUND(100 * SUM(IF(a.answer_count > 0, 1, 0)) / COUNT(*), 1) AS Percent_Questions_with_Answers
FROM `bigquery-public-data.stackoverflow.posts_questions` a right JOIN `bigquery-public-data.stackoverflow.tags` t ON t.tag_name in UNNEST(SPLIT(a.tags,'|'))
GROUP BY Year, Tag
HAVING
  Year > 2019 AND Year < 2021
ORDER BY
  Total DESC
LIMIT 20)
ORDER BY Percent_Questions_with_Answers DESC
"""
response = stackOverflow.query_to_pandas_safe(query, max_gb_scanned=20)
response.head(20)

query1 = "SELECT tag_name as Assunto, count as Num_perguntas FROM `bigquery-public-data.stackoverflow.tags` order BY count DESC"
response1 = stackOverflow.query_to_pandas_safe(query1, max_gb_scanned=20)
response1.head(20)

query2 = """SELECT
  Day_of_Week,
  COUNT(1) AS Num_Questions,
  SUM(answered_in_1h) AS Num_Answered_in_1H,
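The source cuts the query off mid-statement; below is one hedged reconstruction of how such an aggregation might be finished. The subquery, the posts_answers join, and the answered_in_1h definition are assumptions, not recovered text:

# Hypothetical completion, not the original code: derive answered_in_1h per
# question, then aggregate by day of week.
query2_completed = """SELECT
  Day_of_Week,
  COUNT(1) AS Num_Questions,
  SUM(answered_in_1h) AS Num_Answered_in_1H
FROM (
  -- assumed inner query: flag questions answered within an hour
  SELECT
    EXTRACT(DAYOFWEEK FROM q.creation_date) AS Day_of_Week,
    IF(MIN(a.creation_date) <= TIMESTAMP_ADD(q.creation_date, INTERVAL 1 HOUR),
       1, 0) AS answered_in_1h
  FROM `bigquery-public-data.stackoverflow.posts_questions` q
  LEFT JOIN `bigquery-public-data.stackoverflow.posts_answers` a
    ON a.parent_id = q.id
  GROUP BY q.id, q.creation_date
)
GROUP BY Day_of_Week
ORDER BY Day_of_Week
"""
response2 = stackOverflow.query_to_pandas_safe(query2_completed, max_gb_scanned=20)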
Example No. 5
import os
import time

import bq_helper
from bq_helper import BigQueryHelper
from oauth2client.client import GoogleCredentials

# Point the Google client libraries at the local service-account key,
# then pick up the application-default credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = \
    'C:\\Users\\lukek\\Desktop\\SQL\\lschlab2weather-495aae5d3687.json'
os.chdir(r'C:\Users\lukek\Desktop\SQL\huge datasets')
credentials = GoogleCredentials.get_application_default()

# create a helper object for our bigquery dataset
bqh = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                               dataset_name="noaa_gsod")

# build and run a series of queries, pulling one year of GSOD weather per file
# WARNING: each year takes 5+ minutes to run and each resulting CSV is ~100MB!
weather = BigQueryHelper("bigquery-public-data", "noaa_gsod")
print(weather.list_tables())

start = time.time()
START_YEAR = 2017
END_YEAR = 2020

# range() excludes END_YEAR, so this pulls the 2017-2019 tables
for year in range(START_YEAR, END_YEAR):
    query = "SELECT stn,year,mo,da,temp,dewp,slp,stp,visib,wdsp,mxpsd,gust,max,min,prcp,sndp,fog,rain_drizzle,snow_ice_pellets,hail,thunder,tornado_funnel_cloud FROM `bigquery-public-data.noaa_gsod.gsod{}`".format(
        year)
    df_wthr = bqh.query_to_pandas_safe(query, max_gb_scanned=5)
    filename = 'US_weather_{}.csv'.format(year)
    df_wthr.to_csv(filename, index=False)
    print("Saved {}".format(filename))

print('It took', time.time() - start, 'seconds.')
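Once the loop finishes, the yearly CSVs can be stitched back together; a minimal sketch, assuming the US_weather_*.csv naming used above:

import glob

import pandas as pd

# Recombine the per-year files into a single DataFrame for analysis
frames = [pd.read_csv(f) for f in sorted(glob.glob('US_weather_*.csv'))]
all_weather = pd.concat(frames, ignore_index=True)
print(all_weather.shape)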