def main(): path = '../data/states' bq_assistant = BigQueryHelper('bigquery-public-data', 'nhtsa_traffic_fatalities') """ Per State Overview - Fatal accidents per state - Accident Fatalities per state - Fatalities / Accident per state """ QUERY = """ SELECT state_number, state_name, COUNT(consecutive_number) AS accidents, SUM(number_of_fatalities) AS fatalities, SUM(number_of_fatalities) / COUNT(consecutive_number) AS fatalities_per_accident FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_{0}` GROUP BY state_number, state_name ORDER BY fatalities_per_accident DESC """ accident_fatalities_2015 = bq_assistant.query_to_pandas(QUERY.format(2015)) accident_fatalities_2015['year'] = 2015 accident_fatalities_2016 = bq_assistant.query_to_pandas(QUERY.format(2016)) accident_fatalities_2016['year'] = 2016 accident_fatalities_2015.to_csv(f'{path}/accident_fatalities_2015.csv', index=False) accident_fatalities_2016.to_csv(f'{path}/accident_fatalities_2016.csv', index=False)
class TestBQHelper(unittest.TestCase): def setUp(self): self.my_bq = BigQueryHelper("bigquery-public-data", "openaq") self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`" # Query randomized so it won't hit the cache across multiple test runs self.randomizable_query = """ SELECT value FROM `bigquery-public-data.openaq.global_air_quality` WHERE value = {0}""" def test_list_tables(self): self.assertEqual(self.my_bq.list_tables(), ['global_air_quality']) def test_list_schema(self): self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11) def test_estimate_query_size(self): self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float) def test_query_to_pandas(self): self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame) def test_query_safe_passes(self): self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame) def test_query_safe_fails(self): # Different query must be used for this test to ensure we don't hit the # cache and end up passing by testing a query that would use zero bytes. fail_query = self.randomizable_query.format(random()) self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10)) def test_head(self): self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame) def test_useage_tracker(self): self.my_bq.query_to_pandas(self.randomizable_query.format(random())) self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0) def test_bad_query_raises_right_error(self): with self.assertRaises(BadRequest): self.my_bq.query_to_pandas("Not a valid query") def test_list_nested_schema(self): nested_helper = BigQueryHelper("bigquery-public-data", "github_repos") self.assertEqual(len(nested_helper.table_schema('commits')), 33)
def main(): path = '../data/persons' bq_assistant = BigQueryHelper('bigquery-public-data', 'nhtsa_traffic_fatalities') QUERY = """ SELECT p.state_number, p.consecutive_number, p.vehicle_number, p.person_number, person_type_name AS person_type, age, sex, injury_severity_name AS injury_severity, land_use_name AS land_use, functional_system_name AS trafficway_type, manner_of_collision_name AS manner_of_collision, body_type_name AS body_type, rollover, seating_position_name AS seating_position, ejection_name AS ejection, restraint_system_helmet_use_name AS safety_equipment_use, air_bag_deployed_name AS air_bag_deployed, non_motorist_safety_equipment_use FROM `bigquery-public-data.nhtsa_traffic_fatalities.person_{0}` AS p LEFT JOIN `bigquery-public-data.nhtsa_traffic_fatalities.safetyeq_{0}` AS s ON p.consecutive_number=s.consecutive_number AND p.vehicle_number=s.vehicle_number AND p.person_number=s.person_number ORDER BY consecutive_number, vehicle_number, person_number """ data_2015 = bq_assistant.query_to_pandas(QUERY.format(2015)) data_2016 = bq_assistant.query_to_pandas(QUERY.format(2016)) data_2015.to_csv(f'{path}/person_data_2015.csv', index=False) data_2016.to_csv(f'{path}/person_data_2016.csv', index=False)
import pandas as pd from bq_helper import BigQueryHelper # import plotly.graph_objs as go # from plotly.offline import plot bq_assistant = BigQueryHelper('bigquery-public-data', 'san_francisco') QUERY = """ SELECT call_number, unit_id, call_type, battalion FROM `bigquery-public-data.san_francisco.sffd_service_calls` LIMIT 10000 """ df = bq_assistant.query_to_pandas(QUERY) print(df.head(3)) # trace1 = go.Scatter( # # ) # # trace2 = go.Pie( # # ) # # trace3 = go.Bar( # # ) # data = [trace1] #
import plotly.graph_objs as go from plotly.offline import plot os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "keys.json" bq_assistant = BigQueryHelper('bigquery-public-data', 'epa_historical_air_quality.no2_daily_summary') QUERY = """ SELECT sample_duration, date_local, units_of_measure, county_name FROM bigquery-public-data.epa_historical_air_quality.no2_daily_summary LIMIT 100 """ df: pandas.core.frame.DataFrame = bq_assistant.query_to_pandas(QUERY) df_group_by_county_name = df.groupby(['county_name']) df.num_of_samples = df_group_by_county_name["sample_duration"].count() trace1 = go.Scatter( x = df.num_of_samples.index, y = df.num_of_samples.values, mode = "lines+markers", name = "num of samples" ) trace2 = go.Pie( labels=df.num_of_samples.index,
QUERY2016 = """ SELECT pollutant.county_name AS County, AVG(pollutant.aqi) AS AvgAQI_pollutant FROM `bigquery-public-data.epa_historical_air_quality.pollutant_daily_summary` as pollutant WHERE pollutant.poc = 1 AND EXTRACT(YEAR FROM pollutant.date_local) = 2016 GROUP BY pollutant.county_name """ df_2016 = None for elem_g in pollutants: query = QUERY2016.replace("pollutant", elem_g) temp = bq_assistant.query_to_pandas(query).set_index('County') df_2016 = pd.concat([df_2016, temp], axis=1, join='outer') df_2016 = df_2016.apply(lambda x: x.fillna(x.mean()), axis=0) df_2016.sample(10, random_state=42) # Okay so now we have the 2016 measures for every county in the US, what can we do about it to retrieve some useful insights ? Well, **clustering** is a natural answer in the context ! # Let's see what t-SNE will give us here : # In[ ]: from sklearn.manifold import TSNE X_tsne = TSNE(n_components=2, n_iter=2000, perplexity=35, random_state=5).fit_transform(df_2016) df_tsne = pd.DataFrame(X_tsne) df_tsne['County'] = list(df_2016.index)
get_ipython().run_line_magic('matplotlib', 'inline') import seaborn as sns plt.style.use('fivethirtyeight') # #### Lets make a query to get a total sum of bitcoins transacted for every day. # In[ ]: q = """ SELECT TIMESTAMP_MILLIS((timestamp - MOD(timestamp, 86400000))) as Timestamp, sum(o.output_satoshis) as output_price from `bigquery-public-data.bitcoin_blockchain.transactions`JOIN UNNEST(outputs) as o group by timestamp """ print(str(round((bq_assistant.estimate_query_size(q)), 2)) + str(" GB")) results3 = bq_assistant.query_to_pandas(q) results3["output_price"] = results3["output_price"].apply( lambda x: float(x / 100000000)) results3 = results3.sort_values(by="Timestamp") results3.head() # ### In this query we will find the total count of transactions per day # In[ ]: q = """ SELECT TIMESTAMP_MILLIS((timestamp - MOD(timestamp, 86400000))) as Timestamp , count(Timestamp) as output_count from `bigquery-public-data.bitcoin_blockchain.transactions` group by timestamp """ print(str(round((bq_assistant.estimate_query_size(q)), 2)) + str(" GB"))
import plotly.graph_objects as go from plotly.offline import plot from bq_helper import BigQueryHelper client = BigQueryHelper('bigquery-public-data','stackoverflow') query = """ select * from bigquery-public-data.stackoverflow.stackoverflow_posts limit 1000 """ df = client.query_to_pandas(query) print(df.head(10)) df_filter = df[['id', 'title', 'comment_count', 'answer_count']][ (df.comment_count>=2) & (df.answer_count>=1) ] print(df_filter.comment_count.max()) print(df_filter[['comment_count']].min()) print(df_filter.head(10)) print(df_filter.size) print(df_filter.count) # trace = go.Scatter(x = df.creation_date, y= df.count_of_posts)
import pandas as pd import numpy as np from google.cloud import bigquery import os from bq_helper import BigQueryHelper os.environ[ "GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\ntang\Downloads\My Project-66567def43dc.json" bq_assistant = BigQueryHelper("bigquery-public-data", "epa_historical_air_quality") pollutants = ['o3', 'no2'] QUERY = """ SELECT pollutant.latitude AS Lat, pollutant.longitude AS Lon, pollutant.sample_measurement AS pollutant, pollutant.date_local as Date, pollutant.time_local as Time FROM `bigquery-public-data.epa_historical_air_quality.pollutant_hourly_summary` as pollutant WHERE pollutant.poc = 1 AND EXTRACT(YEAR FROM pollutant.date_local) BETWEEN 2013 AND 2017 """ df = None for elem_g in pollutants: query = QUERY.replace("pollutant", elem_g) temp = bq_assistant.query_to_pandas(query) df = pd.concat([df, temp], axis=1, join='outer')
} } es.indices.create(index='chicago-taxis', body=body) # query for chicago taxi records using BigQuery chicago_taxis = BigQueryHelper(active_project="bigquery-public-data", dataset_name="chicago_taxi_trips") i = 0 succeeded = 0 failed = 0 while True: # we need to iterate through the table because a single query is too large # we'll query 5000 records at a time query = "SELECT * from `bigquery-public-data.chicago_taxi_trips.taxi_trips` LIMIT 5000 OFFSET {}".format( 5000 * i) df = chicago_taxis.query_to_pandas(query) if df.shape[0] == 0: break # drop any records without a pickup or dropoff location df.dropna(subset=[ 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude' ], how='any', inplace=True) # convert the data frame to a list of dicts L = df.fillna('').to_dict(orient='records')
def main(): path = '../data/accidents' bq_assistant = BigQueryHelper('bigquery-public-data', 'nhtsa_traffic_fatalities') QUERY = """ SELECT a.state_number, a.consecutive_number, veh.vehicle_number, state_name, a.hour_of_crash AS hour_of_day, CASE WHEN day_of_week = 1 THEN 'Sunday' WHEN day_of_week = 2 THEN 'Monday' WHEN day_of_week = 3 THEN 'Tuesday' WHEN day_of_week = 4 THEN 'Wednesday' WHEN day_of_week = 5 THEN 'Thursday' WHEN day_of_week = 6 THEN 'Friday' WHEN day_of_week = 7 THEN 'Saturday' END day_of_week, CASE WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 1 THEN 'January' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 2 THEN 'February' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 3 THEN 'March' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 4 THEN 'April' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 5 THEN 'May' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 6 THEN 'June' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 7 THEN 'July' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 8 THEN 'August' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 9 THEN 'September' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 10 THEN 'October' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 11 THEN 'November' WHEN EXTRACT(MONTH FROM a.timestamp_of_crash) = 12 THEN 'December' END month, EXTRACT(YEAR FROM a.timestamp_of_crash) AS year, land_use_name AS land_use, national_highway_system, route_signing_name AS roadway_type, type_of_intersection AS intersection, light_condition_name AS light_condition, atmospheric_conditions_1_name AS atmospheric_conditions, latitude, longitude, manner_of_collision_name AS manner_of_collision, number_of_vehicle_forms_submitted_all AS num_vehicles, number_of_persons_not_in_motor_vehicles_in_transport_mvit AS num_nonmotorists, number_of_persons_in_motor_vehicles_in_transport_mvit AS num_motorists, number_of_fatalities AS num_fatalities, number_of_drunk_drivers AS num_drunk_drivers, body_type_name AS body_type, vehicle_model_year AS vehicle_year, rollover, contributing_circumstances_motor_vehicle_name AS vehicle_conditions, previous_dwi_convictions, previous_speeding_convictions, speeding_related, speed_limit, drivers_vision_obscured_by_name AS driver_vision_obscured, FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_{0}` AS a JOIN `bigquery-public-data.nhtsa_traffic_fatalities.vehicle_{0}` AS veh ON a.consecutive_number=veh.consecutive_number JOIN `bigquery-public-data.nhtsa_traffic_fatalities.factor_{0}` AS f ON a.consecutive_number=f.consecutive_number AND veh.vehicle_number=f.vehicle_number JOIN `bigquery-public-data.nhtsa_traffic_fatalities.vision_{0}` AS v ON a.consecutive_number=v.consecutive_number AND veh.vehicle_number=v.vehicle_number ORDER BY consecutive_number, vehicle_number """ data_2015 = bq_assistant.query_to_pandas(QUERY.format(2015)) data_2016 = bq_assistant.query_to_pandas(QUERY.format(2016)) data_2015.to_csv(f'{path}/accident_driver_data_2015.csv', index=False) data_2016.to_csv(f'{path}/accident_driver_data_2016.csv', index=False)
from bigquery-public-data.covid19_italy.data_by_region group by region_name """ query_for_scatter = """ select date, AVG(total_hospitalized_patients) as hospitalized from bigquery-public-data.covid19_italy.data_by_region group by date order by date """ df = client.query_to_pandas(query) # 3.1 BAR >><< df_for_bar = client.query_to_pandas(query_for_bar) bar = go.Bar(x=df_for_bar.date, y=df_for_bar.hospitalized_patients_symptoms) bar_layout = go.Layout(title='Toscana', xaxis=dict(title='Date'), yaxis=dict(title='Amount')) fig_bar = go.Figure(data=[bar], layout=bar_layout) plot(fig_bar, filename='bar_file') # 3.2 Pie >>><<<