Example #1
def extract_data():
    try:
        # Connect to the BigQuery API.
        bqclient = bigquery.Client()
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        logging.info('BigQuery successfully authenticated')
    except Exception:
        # Without authenticated clients there is nothing left to do.
        logging.error('BigQuery authentication unsuccessful')
        raise

    # Actual query is here.
    query_string = """
  SELECT
    *
  FROM
    DATABASE.TABLE_NAME
  """
    try:
        # Create a pandas DataFrame, downloading rows via the Storage API.
        dataframe = (bqclient.query(query_string).result().to_dataframe(
            bqstorage_client=bqstorageclient))
    except Exception:
        logging.error('Could not successfully extract data')
        raise

    # `today` is defined outside this snippet (see the preamble sketch below).
    dataframe.to_csv(today + '_report.csv', index=False)
    logging.info('Query extracted successfully')
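Example #1 assumes module-level imports and a `today` string that the snippet does not define. A minimal preamble that would make it runnable as-is (these lines are assumptions, not the original project's code):

import datetime
import logging

from google.cloud import bigquery
from google.cloud import bigquery_storage_v1beta1

logging.basicConfig(level=logging.INFO)
today = datetime.date.today().strftime("%Y%m%d")  # assumed format, e.g. "20240101"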
Example #2
def check_big_query(ArchivoJson):
    try:
        credentials = service_account.Credentials.from_service_account_file(
            ArchivoJson)
        print("cred")
        scoped_credentials = credentials.with_scopes(
            ['https://www.googleapis.com/auth/cloud-platform'])
        print("scoped_credentials")
        bqclient = bigquery.Client(credentials=scoped_credentials,
                                   project="firm-aviary-195016")
        print("bqclient")
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
            credentials=scoped_credentials)
        print("bqstorageclient")
        query = "SELECT totals.timeOnSite AS Tiempo, FullVisitorID FROM `firm-aviary-195016.81564101.ga_sessions_*`WHERE _TABLE_SUFFIX='20190101'"
        print("query")
        dataframe = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        print("bigquery: successful")
    except ImportError as error:
        print("Import error: ", error)
    except TransportError as error:
        print("Transport error: ", error)
    except:
        print("Unexpected error: ", sys.exc_info()[0])
Example #3
def clients():
    # [START bigquerystorage_pandas_tutorial_all]
    # [START bigquerystorage_pandas_tutorial_create_client]
    import google.auth
    from google.cloud import bigquery
    from google.cloud import bigquery_storage_v1beta1

    # Explicitly create a credentials object. This allows you to use the same
    # credentials for both the BigQuery and BigQuery Storage clients, avoiding
    # unnecessary API calls to fetch duplicate authentication tokens.
    credentials, your_project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )

    # Make clients.
    bqclient = bigquery.Client(
        credentials=credentials,
        project=your_project_id,
    )
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials
    )
    # [END bigquerystorage_pandas_tutorial_create_client]
    # [END bigquerystorage_pandas_tutorial_all]
    return bqclient, bqstorageclient
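A minimal usage sketch for the clients() helper above (not part of the original example); the public table queried here is only an illustration, and application-default credentials are assumed to be configured:

bqclient, bqstorageclient = clients()

query_string = """
    SELECT name, SUM(number) AS total
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    GROUP BY name
    ORDER BY total DESC
    LIMIT 10
"""

# Routing the download through the Storage API client is what speeds up to_dataframe.
dataframe = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
)
print(dataframe.head())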
Example #4
def fetch_data(project):
    bq_client = bigquery.Client(project=project)
    bq_storage_client = bigquery_storage_v1beta1.BigQueryStorageClient()

    IMG_DIR.mkdir(exist_ok=True)

    desktop_data = (bq_client.query(DESKTOP_QUERY).result().to_dataframe(
        bqstorage_client=bq_storage_client))
    desktop_user_state_data = (
        bq_client.query(DESKTOP_USER_STATE_QUERY).result().to_dataframe(
            bqstorage_client=bq_storage_client))

    desktop_data = pd.merge(desktop_data,
                            desktop_user_state_data,
                            on=["date", "country"])
    desktop_data["year"] = pd.DatetimeIndex(desktop_data["date"]).year
    desktop_data["doy"] = pd.DatetimeIndex(desktop_data["date"]).dayofyear
    desktop_data["fakedate"] = [
        pd.to_datetime("20170101") + timedelta(days=x)
        for x in desktop_data["doy"]
    ]
    desktop_data["DAU_MA7d"] = desktop_data.groupby(
        "country")["DAU"].transform(lambda x: x.rolling(window=7).mean())
    desktop_data["MAU_base"] = desktop_data.groupby(["country", "year"
                                                     ]).MAU.transform("first")
    desktop_data["DAU_MA7d_base"] = desktop_data.groupby(
        ["country", "year"]).DAU_MA7d.transform("first")
    desktop_data["dau_pcnt_Jan01"] = (desktop_data["DAU_MA7d"] /
                                      desktop_data["DAU_MA7d_base"])
    desktop_data[
        "mau_pcnt_Jan01"] = desktop_data["MAU"] / desktop_data["MAU_base"]

    return desktop_data
Example #5
def googlebq_patents(label_list, keywords, topn=4):
    import google.auth
    import numpy as np
    import pandas as pd
    from google.cloud import bigquery
    from google.cloud import bigquery_storage_v1beta1

    credentials, project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )

    # Make clients.
    bqclient = bigquery.Client(credentials=credentials, project=project_id)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(credentials=credentials)

    query = search_query(label_list, keywords)

    df_search = (
        bqclient.query(query)
        .result()
        .to_dataframe(bqstorage_client=bqstorageclient)
    )
    df_search['patent_date'] = pd.to_numeric(df_search['patent_date'], errors='coerce').astype(np.int32)
    df_search.to_csv('./flaskapp/static/data/search_result.csv')
    key_query = keywords[0:topn]

    return key_query, df_search
Example #6
    def _read_features_from_bigquery(self,
                                     read_table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {read_table_name}")
        query = """
                select
                    countif(text_token=119) as n_text_token_119,
                    countif(text_token=188) as n_text_token_188,
                    countif(text_token=11170) as n_text_token_11170,
                    countif(text_token=14120) as n_text_token_14120,
                    countif(text_token=120) as n_text_token_120,
                    countif(text_token=146) as n_text_token_146,
                    countif(text_token=18628) as n_text_token_18628,
                    countif(text_token=10111) as n_text_token_10111,
                    countif(text_token=15221) as n_text_token_15221,
                    countif(text_token=1881) as n_text_token_1881,
                    countif(text_token=11662) as n_text_token_11662,
                from {}
                cross join unnest(text_tokens) as text_token
                group by tweet_id, engaging_user_id
                order by tweet_id, engaging_user_id
        """.format(read_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
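Example #6 and the similar snippets below are methods excerpted from larger feature-extraction classes: they rely on self.PROJECT_ID, self.debugging, and self._logger, none of which the excerpts define. A minimal sketch of the scaffolding they assume (the class name and defaults are guesses, not the original code):

import logging


class FeatureBase:
    PROJECT_ID = "your-gcp-project"  # placeholder project id

    def __init__(self, debugging: bool = False):
        # When debugging is True, the methods append " limit 10000" to their queries.
        self.debugging = debugging
        self._logger = logging.getLogger(self.__class__.__name__)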
Example #7
    def _read_features_from_bigquery(self, read_table_name: str,
                                     read_text_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {read_table_name}")
        query = """
                WITH text_length AS (
                    SELECT
                        tweet_id,
                        length(regexp_replace(regexp_replace(text, r"https?://[\w/:%#$&?()~.=+-…]+", ""), r'(@[a-zA-Z0-9_]{{1,15}})', "")) as removed_html_at
                    FROM {}
                )

                SELECT
                    B.removed_html_at
                FROM {} AS A
                LEFT OUTER JOIN text_length AS B
                ON A.tweet_id = B.tweet_id
                ORDER BY
                A.tweet_id, A.engaging_user_id
        """.format(read_text_name, read_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #8
def _make_bqstorage_client(use_bqstorage_api, credentials):
    if not use_bqstorage_api:
        return None

    try:
        from google.cloud import bigquery_storage_v1beta1
    except ImportError as err:
        customized_error = ImportError(
            "Install the google-cloud-bigquery-storage and pyarrow packages "
            "to use the BigQuery Storage API."
        )
        six.raise_from(customized_error, err)

    try:
        from google.api_core.gapic_v1 import client_info as gapic_client_info
    except ImportError as err:
        customized_error = ImportError(
            "Install the grpcio package to use the BigQuery Storage API."
        )
        six.raise_from(customized_error, err)

    return bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials,
        client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
    )
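A hedged usage sketch for _make_bqstorage_client (in the google-cloud-bigquery IPython magics this comes from, six and IPYTHON_USER_AGENT are module-level names): the helper returns None when the Storage API is not requested, so the result can be passed straight to to_dataframe either way.

import google.auth

credentials, _ = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
bqstorage_client = _make_bqstorage_client(True, credentials)
# bqstorage_client is a BigQueryStorageClient here, or None if we had passed False.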
Example #9
    def _read_features_from_bigquery(self, read_table_name: str,
                                     read_text_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {read_table_name}")
        query = """
                WITH sentence_count AS (
                    SELECT
                        tweet_id,
                        array_length(split(text, "。")) sen_cnt_0,
                        array_length(split(regexp_replace(text, r"https?://[\w!?/\+\-_~=;\.,*&@#$%\(\)\'\[\]]+", ""), ".")) sen_cnt_1,
                    FROM {}
                )
                SELECT
                    B.sen_cnt_0,
                    B.sen_cnt_1,
                FROM {} AS A
                LEFT OUTER JOIN sentence_count AS B
                ON A.tweet_id = B.tweet_id
                ORDER BY
                A.tweet_id, A.engaging_user_id
        """.format(read_text_name, read_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #10
    def _read_features_from_bigquery(self) -> pd.DataFrame:
        query = """
            WITH
            cumulative AS (
              SELECT
                row_id,
                -- to avoid leakage, the window covers past records up to, but not including, the current one
                SUM(1) OVER (PARTITION BY user_id, part ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_appearance,
                SUM(answered_correctly) OVER (PARTITION BY user_id, part ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_corrected_answers,
              FROM
                `wantedly-individual-shu.riiid.train_questions`
            )
            SELECT
              cumlative_user_appearance,
              cumlative_user_corrected_answers,
              cumlative_user_corrected_answers / cumlative_user_appearance AS mean_user_accuracy,
            FROM
              cumulative
        """
        query += " order by row_id"
        if self.debugging:
            query += " limit 10000"
        self._logger.info(f"{query}")

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (
            bqclient.query(query)
            .result()
            .to_dataframe(bqstorage_client=bqstorageclient)
        )
        print(df)
        return df
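The window in the query above computes leakage-safe cumulative statistics: each row sees only the 100 rows before it, never itself. For reference, a rough pandas equivalent of the same idea (the DataFrame layout and column names mirror the query and are assumptions):

import pandas as pd


def leakage_safe_cumulative(df: pd.DataFrame) -> pd.DataFrame:
    # Sort so the window runs in time order within each (user_id, part) group.
    df = df.sort_values(["user_id", "part", "timestamp"]).copy()
    grouped = df.groupby(["user_id", "part"])
    # Count of prior rows, capped at the 100-row window used in the SQL.
    df["cumlative_user_appearance"] = grouped.cumcount().clip(upper=100)
    # shift(1) drops the current row's own label, mirroring "1 PRECEDING".
    df["cumlative_user_corrected_answers"] = grouped["answered_correctly"].transform(
        lambda s: s.shift(1).rolling(window=100, min_periods=1).sum()
    )
    return df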
Example #11
    def _read_text_from_bigquery(self, train_text_name: str,
                                 test_text_name: str) -> pd.DataFrame:
        self._logger.info(
            f"Reading from {train_text_name} and {test_text_name}")
        query = """
                (
                SELECT
                    tweet_id,
                    regexp_replace(regexp_replace(text, r"https?://[\w/:%#$&?()~.=+-…]+", ""), r'(@[a-zA-Z0-9_]{{1,15}})', "") as text,
                FROM {}
                WHERE language = '22C448FF81263D4BAF2A176145EE9EAD'   -- japanese
                )
                UNION ALL
                (
                SELECT
                    tweet_id,
                    regexp_replace(regexp_replace(text, r"https?://[\w/:%#$&?()~.=+-…]+", ""), r'(@[a-zA-Z0-9_]{{1,15}})', "") as text,
                FROM {}
                WHERE language = '22C448FF81263D4BAF2A176145EE9EAD'   -- japanese
                )
        """.format(train_text_name, test_text_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #12
    def _read_features_from_bigquery(self) -> pd.DataFrame:
        query = """
            WITH
            train_only_questions AS (
              SELECT
                row_id,
                prior_question_elapsed_time,
              FROM
                riiid.train
              WHERE
                content_type_id = 0
            )
            SELECT
              prior_question_elapsed_time,
            FROM
              train_only_questions
        """
        query += " order by row_id"
        if self.debugging:
            query += " limit 10000"
        self._logger.info(f"{query}")

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #13
    def _read_features_from_bigquery(self) -> pd.DataFrame:
        query = """
            WITH
            train_only_questions AS (
              SELECT
                row_id,
                content_id,
                prior_question_had_explanation,
              FROM
                riiid.train
              WHERE
                content_type_id = 0
            )
            SELECT
              questions.part,
              CASE WHEN train_only_questions.prior_question_had_explanation IS TRUE THEN 1 ELSE 0 END AS prior_question_had_explanation,
            FROM
              train_only_questions
            LEFT OUTER JOIN
              riiid.questions
              ON train_only_questions.content_id = questions.question_id
        """
        query += " order by row_id"
        if self.debugging:
            query += " limit 10000"
        self._logger.info(f"{query}")

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #14
    def __init__(self, stmt, num_streams):
        self.stmt = stmt
        self.num_streams = num_streams
        self.bq_client = bigquery.Client(project=self.PROJECT)

        self.bq_storage_client = bigquery_storage_v1beta1.BigQueryStorageClient(
        )
        self.run()
Example #15
def kangkang_apply_for_query():
    credentials, your_project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"])
    # Make clients.
    bqclient = bigquery.Client(credentials=credentials,
                               project=your_project_id)
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)
Example #16
    def _read_features_from_bigquery(self) -> pd.DataFrame:
        query = """
          WITH
          train AS (
            SELECT
              row_id,
              user_id,
              content_id,
              LAG(content_id, 1) OVER(PARTITION BY user_id ORDER BY timestamp) AS prior_content_id,
            FROM
              `wantedly-individual-shu.riiid.train_questions` AS train_questions
          )
          SELECT
            CASE WHEN
              SQRT(POW(user_vec.NMF_0, 2) + POW(user_vec.NMF_1, 2) + POW(user_vec.NMF_2, 2) + POW(user_vec.NMF_3, 2) + POW(user_vec.NMF_4, 2) + POW(user_vec.NMF_5, 2) + POW(user_vec.NMF_6, 2) + POW(user_vec.NMF_7, 2) + POW(user_vec.NMF_8, 2) + POW(user_vec.NMF_9, 2)) = 0
              OR
              SQRT(POW(content_vec.NMF_0, 2) + POW(content_vec.NMF_1, 2) + POW(content_vec.NMF_2, 2) + POW(content_vec.NMF_3, 2) + POW(content_vec.NMF_4, 2) + POW(content_vec.NMF_5, 2) + POW(content_vec.NMF_6, 2) + POW(content_vec.NMF_7, 2) + POW(content_vec.NMF_8, 2) + POW(content_vec.NMF_9, 2)) = 0
              THEN -999
            ELSE
              (
                user_vec.NMF_0 * content_vec.NMF_0 + 
                user_vec.NMF_1 * content_vec.NMF_1 + 
                user_vec.NMF_2 * content_vec.NMF_2 + 
                user_vec.NMF_3 * content_vec.NMF_3 + 
                user_vec.NMF_4 * content_vec.NMF_4 + 
                user_vec.NMF_5 * content_vec.NMF_5 + 
                user_vec.NMF_6 * content_vec.NMF_6 + 
                user_vec.NMF_7 * content_vec.NMF_7 + 
                user_vec.NMF_8 * content_vec.NMF_8 + 
                user_vec.NMF_9 * content_vec.NMF_9
              ) / SQRT(POW(user_vec.NMF_0, 2) + POW(user_vec.NMF_1, 2) + POW(user_vec.NMF_2, 2) + POW(user_vec.NMF_3, 2) + POW(user_vec.NMF_4, 2) + POW(user_vec.NMF_5, 2) + POW(user_vec.NMF_6, 2) + POW(user_vec.NMF_7, 2) + POW(user_vec.NMF_8, 2) + POW(user_vec.NMF_9, 2))
                / SQRT(POW(content_vec.NMF_0, 2) + POW(content_vec.NMF_1, 2) + POW(content_vec.NMF_2, 2) + POW(content_vec.NMF_3, 2) + POW(content_vec.NMF_4, 2) + POW(content_vec.NMF_5, 2) + POW(content_vec.NMF_6, 2) + POW(content_vec.NMF_7, 2) + POW(content_vec.NMF_8, 2) + POW(content_vec.NMF_9, 2))
            END AS similarity,
          FROM
            train AS train
          LEFT OUTER JOIN
            `wantedly-individual-shu.riiid.content_embedding_lda10` AS content_vec
            ON train.content_id = content_vec.content_id
          LEFT OUTER JOIN
            `wantedly-individual-shu.riiid.content_embedding_lda10` AS user_vec
            ON train.prior_content_id = user_vec.content_id
        """
        query += " order by row_id"
        if self.debugging:
            query += " limit 10000"
        self._logger.info(f"{query}")

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (
            bqclient.query(query)
            .result()
            .to_dataframe(bqstorage_client=bqstorageclient)
        )
        print(df)
        return df
Example #17
    def _read_inter_table_from_bigquery(self, table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {table_name}")
        query = """
                WITH subset AS (
                    SELECT
                        tweet_id,
                        engaging_user_id,
                        any_value(hashtags) AS hashtags,
                        any_value(reply_engagement_timestamp) AS reply_engagement_timestamp,
                        any_value(retweet_engagement_timestamp) AS retweet_engagement_timestamp,
                        any_value(retweet_with_comment_engagement_timestamp) AS retweet_with_comment_engagement_timestamp,
                        any_value(like_engagement_timestamp) AS like_engagement_timestamp,
                    FROM {}
                    GROUP BY tweet_id, engaging_user_id
                )
                , unnest_subset AS (
                SELECT
                    tweet_id,
                    engaging_user_id,
                    hashtag,
                    reply_engagement_timestamp,
                    retweet_engagement_timestamp,
                    retweet_with_comment_engagement_timestamp,
                    like_engagement_timestamp
                FROM subset,
                unnest(hashtags) AS hashtag
                )
                , use_combination AS (
                SELECT hashtag, engaging_user_id, COUNT(*) AS cnt
                FROM unnest_subset
                GROUP BY hashtag, engaging_user_id
                HAVING COUNT(*) >= 1
                )
                SELECT
                    unnest_subset.tweet_id,
                    unnest_subset.engaging_user_id,
                    unnest_subset.hashtag,
                    CASE WHEN unnest_subset.reply_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS reply_engagement,
                    CASE WHEN unnest_subset.retweet_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS retweet_engagement,
                    CASE WHEN unnest_subset.retweet_with_comment_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS retweet_with_comment_engagement,
                    CASE WHEN unnest_subset.like_engagement_timestamp IS NULL THEN 0 ELSE 1 END AS like_engagement,
                FROM
                    unnest_subset
                INNER JOIN
                    use_combination
                    ON unnest_subset.hashtag = use_combination.hashtag
                    AND unnest_subset.engaging_user_id = use_combination.engaging_user_id
        """.format(table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #18
    def _read_features_from_bigquery(self) -> pd.DataFrame:
        query = """
            WITH
            train AS (
                SELECT
                content_id,
                val,
                answered_correctly,
                row_number() over(partition by user_id, content_id order by timestamp) as rank
                FROM
                `wantedly-individual-shu.riiid.train_questions`
            ),
            aggregation_per_content AS (
                SELECT
                content_id,
                CASE
                  WHEN AVG(answered_correctly) = 0 THEN 999
                  ELSE (1 - AVG(answered_correctly)) / AVG(answered_correctly) END AS weight
                FROM
                train
                WHERE
                val = 0   -- use only train
                AND rank = 1   -- use only each user's first attempt at the question
                GROUP BY
                content_id
            ),
            cumulative AS (
              SELECT
                row_id,
                -- to avoid leakage, the window covers past records up to, but not including, the current one
                SUM(aggregation_per_content.weight) OVER (PARTITION BY user_id ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_appearance,
                SUM(answered_correctly * aggregation_per_content.weight) OVER (PARTITION BY user_id ORDER BY timestamp ROWS BETWEEN 100 PRECEDING AND 1 PRECEDING) AS cumlative_user_corrected_answers,
              FROM
                `wantedly-individual-shu.riiid.train_questions` AS train_questions
              LEFT OUTER JOIN
                  aggregation_per_content
                  ON train_questions.content_id = aggregation_per_content.content_id
            )
            SELECT
              cumlative_user_appearance,
              cumlative_user_corrected_answers,
              cumlative_user_corrected_answers / cumlative_user_appearance AS mean_user_accuracy,
            FROM
              cumulative
        """
        query += " order by row_id"
        if self.debugging:
            query += " limit 10000"
        self._logger.info(f"{query}")

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        print(df)
        return df
Example #19
    def _read_features_from_bigquery(self, train_table_name: str,
                                     test_table_name: str,
                                     read_table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {read_table_name}")
        query = """
                WITH subset AS (
                SELECT
                    engaged_user_id,
                    engaging_user_id,
                    count(1) as n_pair,
                FROM (
                    (
                        SELECT engaged_user_id, engaging_user_id
                        FROM {}
                    )
                    UNION ALL
                    (
                        SELECT engaged_user_id, engaging_user_id
                        FROM {}
                    )
                )
                GROUP BY
                    engaged_user_id, engaging_user_id
                ),
                agg AS (
                    select
                      org.engaged_user_id,
                      org.engaging_user_id,
                      IFNULL(org.n_pair + rev.n_pair, org.n_pair) AS n_reciprocal_pair,
                    FROM
                      subset AS org
                    LEFT JOIN
                      subset AS rev
                    ON
                      org.engaged_user_id = rev.engaging_user_id
                      AND org.engaging_user_id = rev.engaged_user_id
                )
                SELECT
                -- A.engaged_user_id,
                -- A.engaging_user_id,
                B.n_reciprocal_pair,
                FROM {} AS A
                LEFT OUTER JOIN agg AS B
                ON A.engaged_user_id = B.engaged_user_id AND A.engaging_user_id = B.engaging_user_id
                ORDER BY A.tweet_id, B.engaging_user_id
        """.format(train_table_name, test_table_name, read_table_name)

        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #20
    def _read_hashtags_count_from_bigquery(
            self, train_table_name: str, test_table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {train_table_name} and {test_table_name}")
        query = """
                WITH subset AS (
                  (
                    SELECT tweet_id, any_value(hashtags) AS hashtags
                    FROM {}
                    GROUP BY tweet_id
                  )
                UNION ALL
                  (
                    SELECT tweet_id, any_value(hashtags) AS hashtags
                    FROM {}
                    GROUP BY tweet_id
                  )
                )
                , unnest_subset AS (
                SELECT tweet_id, hashtag
                FROM subset,
                unnest(hashtags) AS hashtag
                )
                , count_hashtag AS (
                SELECT hashtag, COUNT(*) AS cnt
                FROM unnest_subset
                GROUP BY hashtag
                )

                SELECT
                tweet_id,
                AVG(cnt) AS mean_value,
                min(cnt) AS min_value,
                max(cnt) AS max_value,
                case when stddev(cnt) is null then 1 else stddev(cnt) end AS std_value
                FROM (
                    SELECT A.tweet_id, A.hashtag, B.cnt
                    FROM unnest_subset AS A
                    LEFT OUTER JOIN count_hashtag AS B
                    ON A.hashtag = B.hashtag
                )
                GROUP BY
                tweet_id
        """.format(train_table_name, test_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (
            bqclient.query(query)
            .result()
            .to_dataframe(bqstorage_client=bqstorageclient)
        )
        return df
Example #21
def _make_bqstorage_client(use_bqstorage_api, credentials):
    if not use_bqstorage_api:
        return None

    if bigquery_storage_v1beta1 is None:
        raise ImportError(
            "Install the google-cloud-bigquery-storage and fastavro packages "
            "to use the BigQuery Storage API.")

    return bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)
Example #22
def get_bigstorageclient():
    """
    Create Big query Storage client utilizaing variable GOOGLE_APPLLICATION_CREDENTIALS
    :return: Big Query Storage Client
    """
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = Variable.get(
        'GOOGLE_APPLICATION_CREDENTIALS')
    credentials, my_project_id = google.auth.default(
        scopes=['https://www.googleapis.com/auth/cloud-platform'])
    bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials)
    return bqstorageclient
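Variable here is presumably airflow.models.Variable, so this helper is meant to run inside an Airflow task. A minimal sketch of one such callable (the query and output path are placeholders):

from google.cloud import bigquery


def export_report(**context):
    # get_bigstorageclient() sets GOOGLE_APPLICATION_CREDENTIALS, so the plain
    # BigQuery client created afterwards picks up the same service account.
    bqstorageclient = get_bigstorageclient()
    bqclient = bigquery.Client()
    df = (
        bqclient.query("SELECT 1 AS x")  # placeholder query
        .result()
        .to_dataframe(bqstorage_client=bqstorageclient)
    )
    df.to_csv("/tmp/report.csv", index=False)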
Example #23
    def _read_count_tweet_type_count_from_bigquery(
            self, train_table_name: str, test_table_name: str,
            read_table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {read_table_name}")
        query = """
                WITH subset AS (
                  (
                    SELECT tweet_id, any_value(engaged_user_id) as engaged_user_id, any_value(tweet_type) as tweet_type
                    FROM {}
                    GROUP BY tweet_id
                  )
                UNION ALL
                  (
                    SELECT tweet_id, any_value(engaged_user_id) as engaged_user_id, any_value(tweet_type) as tweet_type
                    FROM {}
                    GROUP BY tweet_id
                  )
                )
                , count_tweet_type AS (
                SELECT
                    engaged_user_id,
                    countif(tweet_type = "Quote") as n_quote_tweet,
                    countif(tweet_type = "Retweet") as n_retweet_tweet,
                    countif(tweet_type = "Quote") + countif(tweet_type = "Retweet") as n_quote_plus_retweet,
                    countif(tweet_type = "Quote") / count(*) as ratio_quote_tweet,
                    countif(tweet_type = "Retweet") / count(*) as ratio_retweet_tweet,
                    (countif(tweet_type = "Quote") + countif(tweet_type = "Retweet")) / count(*) as ratio_quote_plus_retweet
                FROM subset
                GROUP BY engaged_user_id
                )

                SELECT
                -- t0.engaged_user_id,
                -- t0.engaging_user_id,
                f1.n_quote_tweet,
                f1.n_retweet_tweet,
                f1.n_quote_plus_retweet,
                f1.ratio_quote_tweet,
                f1.ratio_retweet_tweet,
                f1.ratio_quote_plus_retweet,
                FROM {} AS t0
                LEFT OUTER JOIN count_tweet_type AS f1
                ON t0.engaging_user_id = f1.engaged_user_id
                ORDER BY t0.tweet_id, t0.engaging_user_id
        """.format(train_table_name, test_table_name, read_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #24
    def _read_features_from_bigquery(self, read_table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {read_table_name}")
        query = """
                WITH
                response_times AS (
                SELECT
                    tweet_id,
                    engaging_user_id,
                    CASE WHEN n_engagement = 0 THEN NULL ELSE (
                    IF(like_response_time IS NOT NULL, like_response_time, 0) + 
                    IF(reply_response_time IS NOT NULL, reply_response_time, 0) + 
                    IF(retweet_response_time IS NOT NULL, retweet_response_time, 0) + 
                    IF(retweet_with_comment_response_time IS NOT NULL, retweet_with_comment_response_time, 0)
                    ) / n_engagement END AS avg_response_time
                FROM (
                    SELECT
                    tweet_id,
                    engaging_user_id,
                    TIMESTAMP_DIFF(TIMESTAMP_SECONDS(like_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS like_response_time,
                    TIMESTAMP_DIFF(TIMESTAMP_SECONDS(reply_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS reply_response_time,
                    TIMESTAMP_DIFF(TIMESTAMP_SECONDS(retweet_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS retweet_response_time,
                    TIMESTAMP_DIFF(TIMESTAMP_SECONDS(retweet_with_comment_engagement_timestamp), TIMESTAMP_SECONDS(timestamp), MINUTE) AS retweet_with_comment_response_time,
                    IF(like_engagement_timestamp IS NOT NULL, 1, 0) + 
                    IF(reply_engagement_timestamp IS NOT NULL, 1, 0) + 
                    IF(retweet_engagement_timestamp IS NOT NULL, 1, 0) + 
                    IF(retweet_with_comment_engagement_timestamp IS NOT NULL, 1, 0) AS n_engagement
                    FROM
                    {}
                  )
                )

                SELECT
                  tweet_id,
                  engaging_user_id,
                  avg_response_time
                FROM
                  response_times
                ORDER BY
                  tweet_id,
                  engaging_user_id
        """.format(read_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (
            bqclient.query(query)
            .result()
            .to_dataframe(bqstorage_client=bqstorageclient)
        )
        return df
Example #25
File: gcp.py  Project: klahrich/easycloud
    def __init__(
            self,
            timezone='US/Eastern',
            # note: this default is read from the environment once, when the
            # function is defined, not on each call
            service_account_path=os.environ['GOOGLE_APPLICATION_CREDENTIALS']):
        self.timezone = timezone
        credentials = service_account.Credentials.from_service_account_file(
            service_account_path)
        self.project = credentials.project_id
        self.client = bgq.Client(credentials=credentials,
                                 project=credentials.project_id)
        self.bqstorage_client = bigquery_storage_v1beta1.BigQueryStorageClient(
            credentials=credentials)
        self.storage_client = storage.Client(credentials=credentials,
                                             project=credentials.project_id)
Example #26
    def _read_activity_time_from_bigquery(self, train_table_name: str,
                                          test_table_name: str) -> pd.DataFrame:
        self._logger.info(
            f"Reading from {train_table_name} and {test_table_name}")
        query = """
                WITH subset AS (
                  (
                    SELECT tweet_id, any_value(language) AS language, any_value(Timestamp) AS Timestamp
                    FROM {}
                    GROUP BY tweet_id
                  )
                UNION ALL
                  (
                    SELECT tweet_id, any_value(language) AS language, any_value(Timestamp) AS Timestamp
                    FROM {}
                    GROUP BY tweet_id
                  )
                ), agg AS (
                SELECT language, hour, count(*) AS cnt
                FROM (
                    SELECT
                    tweet_id,
                    language,
                    EXTRACT(HOUR FROM TIMESTAMP_SECONDS(Timestamp)) AS hour
                    FROM
                    subset
                )
                GROUP BY language, hour
                )
                SELECT
                A.language,
                A.hour,
                A.cnt / B.total_cnt AS activity_ratio
                FROM agg AS A
                INNER JOIN (
                SELECT language, sum(cnt) as total_cnt
                FROM agg
                GROUP BY language
                ) AS B
                ON A.language = B.language
        """.format(train_table_name, test_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #27
    def __init__(self, **kwargs):

        self.config_dict = self.override_defaults(**kwargs)
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.config_dict[
            "google_application_credentials"]
        os.environ["GCLOUD_PROJECT"] = self.config_dict["gcloud_project"]

        # https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
        credentials, your_project_id = google.auth.default(
            scopes=["https://www.googleapis.com/auth/cloud-platform"])

        self.client = bigquery.Client(credentials=credentials,
                                      project=your_project_id)
        self.bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(
            credentials=credentials)
Example #28
    def _read_from_bigquery(self, table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {table_name}")
        query = """
            select {}
            from {}
            order by tweet_id, engaging_user_id
        """.format(", ".join(self.import_columns()), table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df
Example #29
def _make_bqstorage_client(use_bqstorage_api, credentials):
    if not use_bqstorage_api:
        return None

    if bigquery_storage_v1beta1 is None:
        raise ImportError(
            "Install the google-cloud-bigquery-storage and fastavro/pyarrow "
            "packages to use the BigQuery Storage API.")

    import google.api_core.gapic_v1.client_info
    import pandas

    client_info = google.api_core.gapic_v1.client_info.ClientInfo(
        user_agent="pandas-{}".format(pandas.__version__))
    return bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=credentials, client_info=client_info)
Example #30
    def _read_2nd_connected_count_from_bigquery(
            self, train_table_name: str, test_table_name: str,
            read_table_name: str) -> pd.DataFrame:
        self._logger.info(f"Reading from {read_table_name}")
        query = """
                WITH follow_edges AS (
                SELECT
                    engaged_user_id, engaging_user_id
                FROM (
                    (
                        SELECT engaged_user_id, engaging_user_id, MAX(CAST(engagee_follows_engager AS INT64)) AS engagee_follows_engager
                        FROM {}
                        GROUP BY engaged_user_id, engaging_user_id
                    )
                    UNION ALL
                    (
                        SELECT engaged_user_id, engaging_user_id, MAX(CAST(engagee_follows_engager AS INT64)) AS engagee_follows_engager
                        FROM {}
                        GROUP BY engaged_user_id, engaging_user_id
                    )
                )
                GROUP BY
                    engaged_user_id, engaging_user_id
                HAVING
                    MAX(CAST(engagee_follows_engager AS INT64)) = 1
                )

                SELECT
                -- t0.tweet_id,
                -- t0.engaging_user_id,
                cast(logical_or(f2.engaging_user_id is not null) as int64) AS engaged_to_engaging_2nd,
                cast(logical_or(f2.engaging_user_id is not null) or any_value(t0.engagee_follows_engager) as int64) AS engaged_to_engaging_1st_and_2nd,
                FROM {} as t0
                LEFT JOIN follow_edges f1 ON t0.engaged_user_id = f1.engaged_user_id
                LEFT JOIN follow_edges f2 ON f1.engaging_user_id = f2.engaged_user_id and t0.engaging_user_id = f2.engaging_user_id
                LEFT JOIN follow_edges f3 ON t0.engaging_user_id = f3.engaged_user_id AND t0.engaged_user_id = f3.engaging_user_id
                GROUP BY t0.tweet_id, t0.engaging_user_id
                ORDER BY t0.tweet_id, t0.engaging_user_id
        """.format(train_table_name, test_table_name, read_table_name)
        if self.debugging:
            query += " limit 10000"

        bqclient = bigquery.Client(project=self.PROJECT_ID)
        bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
        df = (bqclient.query(query).result().to_dataframe(
            bqstorage_client=bqstorageclient))
        return df