def extract_data():
    try:
        # Connect to the BigQuery API.
        bqclient = bigquery.Client()
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        logging.info('BigQuery successfully authenticated')
    except Exception:
        logging.error('BigQuery authentication unsuccessful')
        raise

    # Actual query is here.
    query_string = """
    SELECT *
    FROM DATABASE.TABLE_NAME
    """

    try:
        # Create a pandas DataFrame from the query result.
        dataframe = (bqclient.query(query_string).result().to_dataframe(
            bqstorage_client=bqstorageclient))
    except Exception:
        logging.error('Could not successfully extract data')
        raise

    # Date stamp for the output file name (assumes `datetime` is imported
    # at module level; `today` was undefined in the original snippet).
    today = datetime.date.today().strftime('%Y%m%d')
    dataframe.to_csv(today + '_report.csv', index=False)
    logging.info('Query extracted successfully')
def check_big_query(ArchivoJson):
    try:
        credentials = service_account.Credentials.from_service_account_file(
            ArchivoJson)
        print("cred")
        scoped_credentials = credentials.with_scopes(
            ['https://www.googleapis.com/auth/cloud-platform'])
        print("scoped_credentials")
        bqclient = bigquery.Client(credentials=scoped_credentials,
                                   project="firm-aviary-195016")
        print("bqclient")
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
            credentials=scoped_credentials)
        print("bqstorageclient")
        query = ("SELECT totals.timeOnSite AS Tiempo, FullVisitorID "
                 "FROM `firm-aviary-195016.81564101.ga_sessions_*` "
                 "WHERE _TABLE_SUFFIX='20190101'")
        print("query")
        dataframe = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        print("bigquery: successful")
    except ImportError as error:
        print("Import error: ", error)
    except TransportError as error:
        print("Transport error: ", error)
    except Exception:
        print("Unexpected error: ", sys.exc_info()[0])
def clients():
    # [START bigquerystorage_pandas_tutorial_all]
    # [START bigquerystorage_pandas_tutorial_create_client]
    import google.auth
    from google.cloud import bigquery
    from google.cloud import bigquery_storage_v1beta1

    # Explicitly create a credentials object. This allows you to use the same
    # credentials for both the BigQuery and BigQuery Storage clients, avoiding
    # unnecessary API calls to fetch duplicate authentication tokens.
    credentials, your_project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )

    # Make clients.
    bqclient = bigquery.Client(
        credentials=credentials,
        project=your_project_id,
    )
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials
    )
    # [END bigquerystorage_pandas_tutorial_create_client]
    # [END bigquerystorage_pandas_tutorial_all]
    return bqclient, bqstorageclient
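# Hypothetical usage sketch for clients() above (not from the source). It
# shows the pattern shared by most snippets in this collection: run a query
# with the BigQuery client, then download the result through the BigQuery
# Storage client. The function name and query are illustrative assumptions;
# the table is a real public dataset.
def example_query_to_dataframe():
    bqclient, bqstorageclient = clients()
    query_string = """
    SELECT name, SUM(number) AS total
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    GROUP BY name
    ORDER BY total DESC
    LIMIT 10
    """
    # to_dataframe() transparently uses the Storage API when a
    # bqstorage_client is supplied, which is much faster for large results.
    dataframe = (
        bqclient.query(query_string)
        .result()
        .to_dataframe(bqstorage_client=bqstorageclient)
    )
    return dataframe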
def fetch_data(project):
    bq_client = bigquery.Client(project=project)
    bq_storage_client = bigquery_storage_v1beta1.BigQueryStorageClient()

    IMG_DIR.mkdir(exist_ok=True)

    desktop_data = (bq_client.query(DESKTOP_QUERY).result().to_dataframe(
        bqstorage_client=bq_storage_client))
    desktop_user_state_data = (
        bq_client.query(DESKTOP_USER_STATE_QUERY).result().to_dataframe(
            bqstorage_client=bq_storage_client))
    desktop_data = pd.merge(desktop_data, desktop_user_state_data,
                            on=["date", "country"])

    desktop_data["year"] = pd.DatetimeIndex(desktop_data["date"]).year
    desktop_data["doy"] = pd.DatetimeIndex(desktop_data["date"]).dayofyear
    desktop_data["fakedate"] = [
        pd.to_datetime("20170101") + timedelta(days=x)
        for x in desktop_data["doy"]
    ]
    desktop_data["DAU_MA7d"] = desktop_data.groupby(
        "country")["DAU"].transform(lambda x: x.rolling(window=7).mean())
    desktop_data["MAU_base"] = desktop_data.groupby(
        ["country", "year"]).MAU.transform("first")
    desktop_data["DAU_MA7d_base"] = desktop_data.groupby(
        ["country", "year"]).DAU_MA7d.transform("first")
    desktop_data["dau_pcnt_Jan01"] = (desktop_data["DAU_MA7d"] /
                                      desktop_data["DAU_MA7d_base"])
    desktop_data["mau_pcnt_Jan01"] = (desktop_data["MAU"] /
                                      desktop_data["MAU_base"])
    return desktop_data
def googlebq_patents(label_list, keywords, topn=4):
    import google.auth
    from google.cloud import bigquery
    from google.cloud import bigquery_storage_v1beta1

    credentials, project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )
    # Make clients.
    bqclient = bigquery.Client(credentials=credentials, project=project_id)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)

    query = search_query(label_list, keywords)
    df_search = (
        bqclient.query(query)
        .result()
        .to_dataframe(bqstorage_client=bqstorageclient)
    )
    df_search['patent_date'] = pd.to_numeric(
        df_search['patent_date'], errors='coerce').astype(np.int32)
    df_search.to_csv('./flaskapp/static/data/search_result.csv')
    key_query = keywords[0:4]
    return key_query, df_search
def _read_features_from_bigquery(self, read_table_name: str) -> pd.DataFrame: self._logger.info(f"Reading from {read_table_name}") query = """ select countif(text_token=119) as n_text_token_119, countif(text_token=188) as n_text_token_188, countif(text_token=11170) as n_text_token_11170, countif(text_token=14120) as n_text_token_14120, countif(text_token=120) as n_text_token_120, countif(text_token=146) as n_text_token_146, countif(text_token=18628) as n_text_token_18628, countif(text_token=10111) as n_text_token_10111, countif(text_token=15221) as n_text_token_15221, countif(text_token=1881) as n_text_token_1881, countif(text_token=11662) as n_text_token_11662, from {} cross join unnest(text_tokens) as text_token group by tweet_id, engaging_user_id order by tweet_id, engaging_user_id """.format(read_table_name) if self.debugging: query += " limit 10000" bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = (bqclient.query(query).result().to_dataframe( bqstorage_client=bqstorageclient)) return df
def _read_features_from_bigquery(self, read_table_name: str,
                                 read_text_name: str) -> pd.DataFrame:
    self._logger.info(f"Reading from {read_table_name}")
    query = """
    WITH text_length AS (
        SELECT
            tweet_id,
            length(regexp_replace(regexp_replace(text, r"https?://[w/:%#$&?()~.=+-…]+", ""), r'(@[a-zA-Z0-9_]{{1,15}})', "")) as removed_html_at
        FROM {}
    )
    SELECT
        B.removed_html_at
    FROM {} AS A
    LEFT OUTER JOIN text_length AS B
    ON A.tweet_id = B.tweet_id
    ORDER BY A.tweet_id, A.engaging_user_id
    """.format(read_text_name, read_table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df
def _make_bqstorage_client(use_bqstorage_api, credentials): if not use_bqstorage_api: return None try: from google.cloud import bigquery_storage_v1beta1 except ImportError as err: customized_error = ImportError( "Install the google-cloud-bigquery-storage and pyarrow packages " "to use the BigQuery Storage API." ) six.raise_from(customized_error, err) try: from google.api_core.gapic_v1 import client_info as gapic_client_info except ImportError as err: customized_error = ImportError( "Install the grpcio package to use the BigQuery Storage API." ) six.raise_from(customized_error, err) return bigquery_storage_v1beta1.BigQueryStorageClient( credentials=credentials, client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT), )
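# Hypothetical usage sketch for _make_bqstorage_client() above (not from the
# source). Passing bqstorage_client=None to to_dataframe() is valid: the
# BigQuery client then falls back to the slower REST-based download, so the
# helper's None return value can be passed straight through. The function
# name and parameters here are illustrative assumptions.
def example_run_query(query, credentials, use_bqstorage_api=True):
    from google.cloud import bigquery

    bqclient = bigquery.Client(credentials=credentials)
    # None when the Storage API is disabled; a client otherwise.
    bqstorage_client = _make_bqstorage_client(use_bqstorage_api, credentials)
    return (
        bqclient.query(query)
        .result()
        .to_dataframe(bqstorage_client=bqstorage_client)
    )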
def _read_features_from_bigquery(self, read_table_name: str, read_text_name: str) -> pd.DataFrame: self._logger.info(f"Reading from {read_table_name}") query = """ WITH sentence_count AS ( SELECT tweet_id, array_length(split(text, "。")) sen_cnt_0, array_length(split(regexp_replace(text, r"https?://[\w!?/\+\-_~=;\.,*&@#$%\(\)\'\[\]]+", ""), ".")) sen_cnt_1, FROM {} ) SELECT B.sen_cnt_0, B.sen_cnt_1, FROM {} AS A LEFT OUTER JOIN sentence_count AS B ON A.tweet_id = B.tweet_id ORDER BY A.tweet_id, A.engaging_user_id """.format(read_text_name, read_table_name) if self.debugging: query += " limit 10000" bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = (bqclient.query(query).result().to_dataframe( bqstorage_client=bqstorageclient)) return df
def _read_features_from_bigquery(self) -> pd.DataFrame: query = """ WITH cumulative AS ( SELECT row_id, -- leakを防ぐために, 過去レコードから現時点の1つ前のレコードまでを計算範囲とする SUM(1) OVER (PARTITION BY user_id, part ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_appearance, SUM(answered_correctly) OVER (PARTITION BY user_id, part ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_corrected_answers, FROM `wantedly-individual-shu.riiid.train_questions` ) SELECT cumlative_user_appearance, cumlative_user_corrected_answers, cumlative_user_corrected_answers / cumlative_user_appearance AS mean_user_accuracy, FROM cumulative """ query += " order by row_id" if self.debugging: query += " limit 10000" self._logger.info(f"{query}") bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = ( bqclient.query(query) .result() .to_dataframe(bqstorage_client=bqstorageclient) ) print(df) return df
def _read_text_from_bigquery(self, train_text_name: str,
                             test_text_name: str) -> pd.DataFrame:
    self._logger.info(
        f"Reading from {train_text_name} and {test_text_name}")
    query = """
    (
        SELECT
            tweet_id,
            regexp_replace(regexp_replace(text, r"https?://[w/:%#$&?()~.=+-…]+", ""), r'(@[a-zA-Z0-9_]{{1,15}})', "") as text,
        FROM {}
        WHERE language = '22C448FF81263D4BAF2A176145EE9EAD'  -- japanese
    )
    UNION ALL
    (
        SELECT
            tweet_id,
            regexp_replace(regexp_replace(text, r"https?://[w/:%#$&?()~.=+-…]+", ""), r'(@[a-zA-Z0-9_]{{1,15}})', "") as text,
        FROM {}
        WHERE language = '22C448FF81263D4BAF2A176145EE9EAD'  -- japanese
    )
    """.format(train_text_name, test_text_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df
def _read_features_from_bigquery(self) -> pd.DataFrame: query = """ WITH train_only_questions AS ( SELECT row_id, prior_question_elapsed_time, FROM riiid.train WHERE content_type_id = 0 ) SELECT prior_question_elapsed_time, FROM train_only_questions """ query += " order by row_id" if self.debugging: query += " limit 10000" self._logger.info(f"{query}") bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = (bqclient.query(query).result().to_dataframe( bqstorage_client=bqstorageclient)) return df
def _read_features_from_bigquery(self) -> pd.DataFrame: query = """ WITH train_only_questions AS ( SELECT row_id, content_id, prior_question_had_explanation, FROM riiid.train WHERE content_type_id = 0 ) SELECT questions.part, CASE WHEN train_only_questions.prior_question_had_explanation IS TRUE THEN 1 ELSE 0 END AS prior_question_had_explanation, FROM train_only_questions LEFT OUTER JOIN riiid.questions ON train_only_questions.content_id = questions.question_id """ query += " order by row_id" if self.debugging: query += " limit 10000" self._logger.info(f"{query}") bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = (bqclient.query(query).result().to_dataframe( bqstorage_client=bqstorageclient)) return df
def __init__(self, stmt, num_streams):
    self.stmt = stmt
    self.num_streams = num_streams
    self.bq_client = bigquery.Client(project=self.PROJECT)
    self.bq_storage_client = bigquery_storage_v1beta1.BigQueryStorageClient()
    self.run()
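# Hypothetical sketch (not from the source) of how a class like the one above
# might use `num_streams`: the v1beta1 Storage API can open a read session
# with multiple parallel streams. The project, dataset, and table names are
# placeholder assumptions; only the first stream is read here, whereas a real
# implementation would fan the streams out across threads or processes.
def example_read_with_streams(num_streams):
    from google.cloud import bigquery_storage_v1beta1

    client = bigquery_storage_v1beta1.BigQueryStorageClient()

    table_ref = bigquery_storage_v1beta1.types.TableReference()
    table_ref.project_id = "bigquery-public-data"
    table_ref.dataset_id = "usa_names"
    table_ref.table_id = "usa_1910_current"

    session = client.create_read_session(
        table_ref,
        "projects/my-project",  # placeholder billing project
        requested_streams=num_streams,
    )
    position = bigquery_storage_v1beta1.types.StreamPosition(
        stream=session.streams[0])
    reader = client.read_rows(position)
    return reader.to_dataframe(session)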
def kangkang_apply_for_query():
    # The original passed a pasted BigQuery console URL (with session
    # parameters) as the OAuth scope, which is not a valid scope; the
    # cloud-platform scope is what google.auth.default() expects here.
    credentials, your_project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"])

    # Make clients.
    bqclient = bigquery.Client(credentials=credentials,
                               project=your_project_id)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)
def _read_features_from_bigquery(self) -> pd.DataFrame: query = """ WITH train AS ( SELECT row_id, user_id, content_id, LAG(content_id, 1) OVER(PARTITION BY user_id ORDER BY timestamp) AS prior_content_id, FROM `wantedly-individual-shu.riiid.train_questions` AS train_questions ) SELECT CASE WHEN SQRT(POW(user_vec.NMF_0, 2) + POW(user_vec.NMF_1, 2) + POW(user_vec.NMF_2, 2) + POW(user_vec.NMF_3, 2) + POW(user_vec.NMF_4, 2) + POW(user_vec.NMF_5, 2) + POW(user_vec.NMF_6, 2) + POW(user_vec.NMF_7, 2) + POW(user_vec.NMF_8, 2) + POW(user_vec.NMF_9, 2)) = 0 OR SQRT(POW(content_vec.NMF_0, 2) + POW(content_vec.NMF_1, 2) + POW(content_vec.NMF_2, 2) + POW(content_vec.NMF_3, 2) + POW(content_vec.NMF_4, 2) + POW(content_vec.NMF_5, 2) + POW(content_vec.NMF_6, 2) + POW(content_vec.NMF_7, 2) + POW(content_vec.NMF_8, 2) + POW(content_vec.NMF_9, 2)) = 0 THEN -999 ELSE ( user_vec.NMF_0 * content_vec.NMF_0 + user_vec.NMF_1 * content_vec.NMF_1 + user_vec.NMF_2 * content_vec.NMF_2 + user_vec.NMF_3 * content_vec.NMF_3 + user_vec.NMF_4 * content_vec.NMF_4 + user_vec.NMF_5 * content_vec.NMF_5 + user_vec.NMF_6 * content_vec.NMF_6 + user_vec.NMF_7 * content_vec.NMF_7 + user_vec.NMF_8 * content_vec.NMF_8 + user_vec.NMF_9 * content_vec.NMF_9 ) / SQRT(POW(user_vec.NMF_0, 2) + POW(user_vec.NMF_1, 2) + POW(user_vec.NMF_2, 2) + POW(user_vec.NMF_3, 2) + POW(user_vec.NMF_4, 2) + POW(user_vec.NMF_5, 2) + POW(user_vec.NMF_6, 2) + POW(user_vec.NMF_7, 2) + POW(user_vec.NMF_8, 2) + POW(user_vec.NMF_9, 2)) / SQRT(POW(content_vec.NMF_0, 2) + POW(content_vec.NMF_1, 2) + POW(content_vec.NMF_2, 2) + POW(content_vec.NMF_3, 2) + POW(content_vec.NMF_4, 2) + POW(content_vec.NMF_5, 2) + POW(content_vec.NMF_6, 2) + POW(content_vec.NMF_7, 2) + POW(content_vec.NMF_8, 2) + POW(content_vec.NMF_9, 2)) END AS similarity, FROM train AS train LEFT OUTER JOIN `wantedly-individual-shu.riiid.content_embedding_lda10` AS content_vec ON train.content_id = content_vec.content_id LEFT OUTER JOIN `wantedly-individual-shu.riiid.content_embedding_lda10` AS user_vec ON train.prior_content_id = user_vec.content_id """ query += " order by row_id" if self.debugging: query += " limit 10000" self._logger.info(f"{query}") bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = ( bqclient.query(query) .result() .to_dataframe(bqstorage_client=bqstorageclient) ) print(df) return df
def _read_inter_table_from_bigquery(self, table_name: str) -> pd.DataFrame:
    self._logger.info(f"Reading from {table_name}")
    query = """
    WITH subset AS (
        SELECT
            tweet_id,
            engaging_user_id,
            any_value(hashtags) AS hashtags,
            any_value(reply_engagement_timestamp) AS reply_engagement_timestamp,
            any_value(retweet_engagement_timestamp) AS retweet_engagement_timestamp,
            any_value(retweet_with_comment_engagement_timestamp) AS retweet_with_comment_engagement_timestamp,
            any_value(like_engagement_timestamp) AS like_engagement_timestamp,
        FROM {}
        GROUP BY tweet_id, engaging_user_id
    )
    , unnest_subset AS (
        SELECT
            tweet_id,
            engaging_user_id,
            hashtag,
            reply_engagement_timestamp,
            retweet_engagement_timestamp,
            retweet_with_comment_engagement_timestamp,
            like_engagement_timestamp
        FROM subset, unnest(hashtags) AS hashtag
    )
    , use_combination AS (
        SELECT
            hashtag,
            engaging_user_id,
            COUNT(*) AS cnt
        FROM unnest_subset
        GROUP BY hashtag, engaging_user_id
        HAVING COUNT(*) >= 1
    )
    SELECT
        unnest_subset.tweet_id,
        unnest_subset.engaging_user_id,
        unnest_subset.hashtag,
        CASE WHEN unnest_subset.reply_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS reply_engagement,
        CASE WHEN unnest_subset.retweet_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS retweet_engagement,
        CASE WHEN unnest_subset.retweet_with_comment_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS retweet_with_comment_engagement,
        CASE WHEN unnest_subset.like_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS like_engagement,
    FROM unnest_subset
    INNER JOIN use_combination
    ON unnest_subset.hashtag = use_combination.hashtag
        AND unnest_subset.engaging_user_id = use_combination.engaging_user_id
    """.format(table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df
def _read_features_from_bigquery(self) -> pd.DataFrame: query = """ WITH train AS ( SELECT content_id, val, answered_correctly, row_number() over(partition by user_id, content_id order by timestamp) as rank FROM `wantedly-individual-shu.riiid.train_questions` ), aggregation_per_content AS ( SELECT content_id, CASE WHEN AVG(answered_correctly) = 0 THEN 999 ELSE (1 - AVG(answered_correctly)) / AVG(answered_correctly) END AS weight FROM train WHERE val = 0 -- use only train AND rank = 1 -- 各ユーザの1回目の正解データのみ使う GROUP BY content_id ), cumulative AS ( SELECT row_id, -- leakを防ぐために, 過去レコードから現時点の1つ前のレコードまでを計算範囲とする SUM(aggregation_per_content.weight) OVER (PARTITION BY user_id ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_appearance, SUM(answered_correctly * aggregation_per_content.weight) OVER (PARTITION BY user_id ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_corrected_answers, FROM `wantedly-individual-shu.riiid.train_questions` AS train_questions LEFT OUTER JOIN aggregation_per_content ON train_questions.content_id = aggregation_per_content.content_id ) SELECT cumlative_user_appearance, cumlative_user_corrected_answers, cumlative_user_corrected_answers / cumlative_user_appearance AS mean_user_accuracy, FROM cumulative """ query += " order by row_id" if self.debugging: query += " limit 10000" self._logger.info(f"{query}") bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = (bqclient.query(query).result().to_dataframe( bqstorage_client=bqstorageclient)) print(df) return df
def _read_features_from_bigquery(self, train_table_name: str,
                                 test_table_name: str,
                                 read_table_name: str) -> pd.DataFrame:
    self._logger.info(f"Reading from {read_table_name}")
    query = """
    WITH subset AS (
        SELECT
            engaged_user_id,
            engaging_user_id,
            count(1) as n_pair,
        FROM (
            (
                SELECT engaged_user_id, engaging_user_id FROM {}
            )
            UNION ALL
            (
                SELECT engaged_user_id, engaging_user_id FROM {}
            )
        )
        GROUP BY engaged_user_id, engaging_user_id
    ),
    agg AS (
        select
            org.engaged_user_id,
            org.engaging_user_id,
            IFNULL(org.n_pair + rev.n_pair, org.n_pair) AS n_reciprocal_pair,
        FROM subset AS org
        LEFT JOIN subset AS rev
        ON org.engaged_user_id = rev.engaging_user_id
            AND org.engaging_user_id = rev.engaged_user_id
    )
    SELECT
        -- A.engaged_user_id,
        -- A.engaging_user_id,
        B.n_reciprocal_pair,
    FROM {} AS A
    LEFT OUTER JOIN agg AS B
    ON A.engaged_user_id = B.engaged_user_id
        AND A.engaging_user_id = B.engaging_user_id
    ORDER BY A.tweet_id, A.engaging_user_id
    """.format(train_table_name, test_table_name, read_table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df
def _read_hashtags_count_from_bigquery(
        self, train_table_name: str, test_table_name: str) -> pd.DataFrame:
    self._logger.info(
        f"Reading from {train_table_name} and {test_table_name}")
    query = """
    WITH subset AS (
        (
            SELECT tweet_id, any_value(hashtags) AS hashtags
            FROM {}
            GROUP BY tweet_id
        )
        UNION ALL
        (
            SELECT tweet_id, any_value(hashtags) AS hashtags
            FROM {}
            GROUP BY tweet_id
        )
    )
    , unnest_subset AS (
        SELECT
            tweet_id,
            hashtag
        FROM subset, unnest(hashtags) AS hashtag
    )
    , count_hashtag AS (
        SELECT
            hashtag,
            COUNT(*) AS cnt
        FROM unnest_subset
        GROUP BY hashtag
    )
    SELECT
        tweet_id,
        AVG(cnt) AS mean_value,
        min(cnt) AS min_value,
        max(cnt) AS max_value,
        case when stddev(cnt) is null then 1 else stddev(cnt) end AS std_value
    FROM (
        SELECT
            A.tweet_id,
            A.hashtag,
            B.cnt
        FROM unnest_subset AS A
        LEFT OUTER JOIN count_hashtag AS B
        ON A.hashtag = B.hashtag
    )
    GROUP BY tweet_id
    """.format(train_table_name, test_table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (
        bqclient.query(query)
        .result()
        .to_dataframe(bqstorage_client=bqstorageclient)
    )
    return df
def _make_bqstorage_client(use_bqstorage_api, credentials): if not use_bqstorage_api: return None if bigquery_storage_v1beta1 is None: raise ImportError( "Install the google-cloud-bigquery-storage and fastavro packages " "to use the BigQuery Storage API.") return bigquery_storage_v1beta1.BigQueryStorageClient( credentials=credentials)
def get_bigstorageclient():
    """
    Create a BigQuery Storage client using the GOOGLE_APPLICATION_CREDENTIALS variable.

    :return: BigQuery Storage client
    """
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = Variable.get(
        'GOOGLE_APPLICATION_CREDENTIALS')
    credentials, my_project_id = google.auth.default(
        scopes=['https://www.googleapis.com/auth/cloud-platform'])
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)
    return bqstorageclient
def _read_count_tweet_type_count_from_bigquery(
        self, train_table_name: str, test_table_name: str,
        read_table_name: str) -> pd.DataFrame:
    self._logger.info(f"Reading from {read_table_name}")
    query = """
    WITH subset AS (
        (
            SELECT
                tweet_id,
                any_value(engaged_user_id) as engaged_user_id,
                any_value(tweet_type) as tweet_type
            FROM {}
            GROUP BY tweet_id
        )
        UNION ALL
        (
            SELECT
                tweet_id,
                any_value(engaged_user_id) as engaged_user_id,
                any_value(tweet_type) as tweet_type
            FROM {}
            GROUP BY tweet_id
        )
    )
    , count_tweet_type AS (
        SELECT
            engaged_user_id,
            countif(tweet_type = "Quote") as n_quote_tweet,
            countif(tweet_type = "Retweet") as n_retweet_tweet,
            countif(tweet_type = "Quote") + countif(tweet_type = "Retweet") as n_quote_plus_retweet,
            countif(tweet_type = "Quote") / count(*) as ratio_quote_tweet,
            countif(tweet_type = "Retweet") / count(*) as ratio_retweet_tweet,
            (countif(tweet_type = "Quote") + countif(tweet_type = "Retweet")) / count(*) as ratio_quote_plus_retweet
        FROM subset
        GROUP BY engaged_user_id
    )
    SELECT
        -- t0.engaged_user_id,
        -- t0.engaging_user_id,
        f1.n_quote_tweet,
        f1.n_retweet_tweet,
        f1.n_quote_plus_retweet,
        f1.ratio_quote_tweet,
        f1.ratio_retweet_tweet,
        f1.ratio_quote_plus_retweet,
    FROM {} AS t0
    LEFT OUTER JOIN count_tweet_type AS f1
    ON t0.engaging_user_id = f1.engaged_user_id
    ORDER BY t0.tweet_id, t0.engaging_user_id
    """.format(train_table_name, test_table_name, read_table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df
def _read_features_from_bigquery(self, read_table_name: str) -> pd.DataFrame: self._logger.info(f"Reading from {read_table_name}") query = """ WITH response_times AS ( SELECT tweet_id, engaging_user_id, CASE WHEN n_engagement = 0 THEN NULL ELSE ( IF(like_response_time IS NOT NULL, like_response_time, 0) + IF(reply_response_time IS NOT NULL, reply_response_time, 0) + IF(retweet_response_time IS NOT NULL, retweet_response_time, 0) + IF(retweet_with_comment_response_time IS NOT NULL, retweet_with_comment_response_time, 0) ) / n_engagement END AS avg_response_time FROM ( SELECT tweet_id, engaging_user_id, TIMESTAMP_DIFF(TIMESTAMP_SECONDS(like_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS like_response_time, TIMESTAMP_DIFF(TIMESTAMP_SECONDS(reply_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS reply_response_time, TIMESTAMP_DIFF(TIMESTAMP_SECONDS(retweet_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS retweet_response_time, TIMESTAMP_DIFF(TIMESTAMP_SECONDS(retweet_with_comment_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS retweet_with_comment_response_time, IF(like_engagement_timestamp IS NOT NULL, 1, 0) + IF(reply_engagement_timestamp IS NOT NULL, 1, 0) + IF(retweet_engagement_timestamp IS NOT NULL, 1, 0) + IF(retweet_with_comment_engagement_timestamp IS NOT NULL, 1, 0) AS n_engagement FROM {} ) ) SELECT tweet_id, engaging_user_id, avg_response_time FROM response_times ORDER BY tweet_id, engaging_user_id """.format(read_table_name) if self.debugging: query += " limit 10000" bqclient = bigquery.Client(project=self.PROJECT_ID) bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient() df = ( bqclient.query(query) .result() .to_dataframe(bqstorage_client=bqstorageclient) ) return df
def __init__(
        self,
        timezone='US/Eastern',
        service_account_path=os.environ['GOOGLE_APPLICATION_CREDENTIALS']):
    self.timezone = timezone
    credentials = service_account.Credentials.from_service_account_file(
        service_account_path)
    self.project = credentials.project_id
    self.client = bgq.Client(credentials=credentials,
                             project=credentials.project_id)
    self.bqstorage_client = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)
    self.storage_client = storage.Client(credentials=credentials,
                                         project=credentials.project_id)
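# Hypothetical companion method for the class above (not from the source),
# sketching how its BigQuery and BigQuery Storage clients might be used
# together; the method name and parameter are illustrative assumptions.
def query_to_dataframe(self, query_string):
    # Run the query through the BigQuery client and download the result
    # with the faster BigQuery Storage client created in __init__.
    return (
        self.client.query(query_string)
        .result()
        .to_dataframe(bqstorage_client=self.bqstorage_client)
    )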
def _read_activity_time_from_bigquery(self, train_table_name: str,
                                      test_table_name: str) -> pd.DataFrame:
    self._logger.info(
        f"Reading from {train_table_name} and {test_table_name}")
    query = """
    WITH subset AS (
        (
            SELECT
                tweet_id,
                any_value(language) AS language,
                any_value(Timestamp) AS Timestamp
            FROM {}
            GROUP BY tweet_id
        )
        UNION ALL
        (
            SELECT
                tweet_id,
                any_value(language) AS language,
                any_value(Timestamp) AS Timestamp
            FROM {}
            GROUP BY tweet_id
        )
    ),
    agg AS (
        SELECT
            language,
            hour,
            count(*) AS cnt
        FROM (
            SELECT
                tweet_id,
                language,
                EXTRACT(HOUR FROM TIMESTAMP_SECONDS(Timestamp)) AS hour
            FROM subset
        )
        GROUP BY language, hour
    )
    SELECT
        A.language,
        A.hour,
        A.cnt / B.total_cnt AS activity_ratio
    FROM agg AS A
    INNER JOIN (
        SELECT language, sum(cnt) as total_cnt
        FROM agg
        GROUP BY language
    ) AS B
    ON A.language = B.language
    """.format(train_table_name, test_table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df
def __init__(self, **kwargs):
    self.config_dict = self.override_defaults(**kwargs)

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.config_dict[
        "google_application_credentials"]
    os.environ["GCLOUD_PROJECT"] = self.config_dict["gcloud_project"]

    # https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
    credentials, your_project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"])
    self.client = bigquery.Client(credentials=credentials,
                                  project=your_project_id)
    self.bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)
def _read_from_bigquery(self, table_name: str) -> pd.DataFrame:
    self._logger.info(f"Reading from {table_name}")
    query = """
    select {}
    from {}
    order by tweet_id, engaging_user_id
    """.format(", ".join(self.import_columns()), table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df
def _make_bqstorage_client(use_bqstorage_api, credentials): if not use_bqstorage_api: return None if bigquery_storage_v1beta1 is None: raise ImportError( "Install the google-cloud-bigquery-storage and fastavro/pyarrow " "packages to use the BigQuery Storage API.") import google.api_core.gapic_v1.client_info import pandas client_info = google.api_core.gapic_v1.client_info.ClientInfo( user_agent="pandas-{}".format(pandas.__version__)) return bigquery_storage_v1beta1.BigQueryStorageClient( credentials=credentials, client_info=client_info)
def _read_2nd_connected_count_from_bigquery(
        self, train_table_name: str, test_table_name: str,
        read_table_name: str) -> pd.DataFrame:
    self._logger.info(f"Reading from {read_table_name}")
    query = """
    WITH follow_edges AS (
        SELECT
            engaged_user_id,
            engaging_user_id
        FROM (
            (
                SELECT
                    engaged_user_id,
                    engaging_user_id,
                    MAX(CAST(engagee_follows_engager AS INT64)) AS engagee_follows_engager
                FROM {}
                GROUP BY engaged_user_id, engaging_user_id
            )
            UNION ALL
            (
                SELECT
                    engaged_user_id,
                    engaging_user_id,
                    MAX(CAST(engagee_follows_engager AS INT64)) AS engagee_follows_engager
                FROM {}
                GROUP BY engaged_user_id, engaging_user_id
            )
        )
        GROUP BY engaged_user_id, engaging_user_id
        HAVING MAX(CAST(engagee_follows_engager AS INT64)) = 1
    )
    SELECT
        -- t0.tweet_id,
        -- t0.engaging_user_id,
        cast(logical_or(f2.engaging_user_id is not null) as int64) AS engaged_to_engaging_2nd,
        cast(logical_or(f2.engaging_user_id is not null) or any_value(t0.engagee_follows_engager) as int64) AS engaged_to_engaging_1st_and_2nd,
    FROM {} as t0
    LEFT JOIN follow_edges f1
    ON t0.engaged_user_id = f1.engaged_user_id
    LEFT JOIN follow_edges f2
    ON f1.engaging_user_id = f2.engaged_user_id
        and t0.engaging_user_id = f2.engaging_user_id
    LEFT JOIN follow_edges f3
    ON t0.engaging_user_id = f3.engaged_user_id
        AND t0.engaged_user_id = f3.engaging_user_id
    GROUP BY t0.tweet_id, t0.engaging_user_id
    ORDER BY t0.tweet_id, t0.engaging_user_id
    """.format(train_table_name, test_table_name, read_table_name)
    if self.debugging:
        query += " limit 10000"

    bqclient = bigquery.Client(project=self.PROJECT_ID)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
    df = (bqclient.query(query).result().to_dataframe(
        bqstorage_client=bqstorageclient))
    return df