Example #1
def create_members_table(apps, schema_editor):
    dataset_ref = client.dataset(settings.BIGQUERY_DATASET_ID)

    # Only define fields that are required and not strings (because
    # non-required strings will be populated automatically)
    schema = [
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('id', 'INT64', mode='REQUIRED'),
        bigquery.SchemaField('joined', 'TIMESTAMP'),
        bigquery.SchemaField('lat', 'FLOAT64'),
        bigquery.SchemaField('lon', 'FLOAT64'),
        bigquery.SchemaField('group_id', 'INT64', mode='REQUIRED'),
        bigquery.SchemaField('group_urlname', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('is_pro_admin', 'BOOL'),
    ]
    table_ref = dataset_ref.table('members')
    table = bigquery.Table(table_ref, schema=schema)

    # define partitioning by date
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, field='date')

    # define clustering by group urlname
    table.clustering_fields = ['group_urlname']

    client.create_table(table)
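
The migration above relies on a module-level client and Django settings object that sit outside the snippet. A minimal sketch of that setup, assuming a BIGQUERY_PROJECT_ID setting (the setting name is illustrative, not from the example):

from django.conf import settings
from google.cloud import bigquery

# Assumed module-level client used by create_members_table() above.
client = bigquery.Client(project=getattr(settings, "BIGQUERY_PROJECT_ID", None))

# Inside the Django migration class, the function would typically be registered as:
# operations = [migrations.RunPython(create_members_table, migrations.RunPython.noop)]
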
Example #2
    def load_data_to_bq(self):
        """
        Loads or initializes the load job in BigQuery.
        """
        # for newest data
        bigquery_client = bigquery.Client(CONFIG['project'])
        # print(bigquery_client)
        destination_dataset_ref = bigquery_client.dataset(CONFIG['dataset'])
        destination_table_ref = destination_dataset_ref.table(
            self.table_destination + '$' + self.date_nodash)
        job_config = bigquery.LoadJobConfig()
        job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        # partition by ingestion time
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY)

        with open(self.path, 'rb') as f:
            job = bigquery_client.load_table_from_file(f,
                                                       destination_table_ref,
                                                       job_config=job_config)
            job.result()
            print('----->>>> ' + self.path + ' loaded successfully!')
            os.remove(self.path)
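
The destination above appends a partition decorator: "table$YYYYMMDD" addresses a single ingestion-time partition, so the WRITE_TRUNCATE load replaces only that day's data. A small illustration (the helper name is mine, not part of the example):

from datetime import date

def partition_decorator(table_name, day):
    # "<table>$YYYYMMDD" targets one ingestion-time partition of the table.
    return "{}${}".format(table_name, day.strftime("%Y%m%d"))

print(partition_decorator("events", date(2024, 1, 31)))  # events$20240131
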
Example #3
def dest_partitioned_table(request, bq: bigquery.Client, mock_env,
                           dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-','_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table
Example #4
def _run_project_sql(bq_client, project, dataset, submission_date, slug):
    normalized_slug = _bq_normalize_name(slug)
    date_partition = str(submission_date).replace("-", "")

    query_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("submission_date", "DATE",
                                          str(submission_date)),
        ],
        use_legacy_sql=False,
        clustering_fields=["build_id"],
        default_dataset=f"{project}.{dataset}",
        time_partitioning=bigquery.TimePartitioning(field="submission_date"),
        use_query_cache=True,
        allow_large_results=True,
    )

    for data_type in DATA_TYPES:
        destination_table = (
            f"{project}.{dataset}.{normalized_slug}_{data_type}${date_partition}"
        )
        query_config.destination = destination_table
        query_config.write_disposition = "WRITE_TRUNCATE"

        _run_sql_for_data_type(bq_client, project, dataset, normalized_slug,
                               query_config, data_type)
Example #5
    def bq_create_table(self,
                        table_ref,
                        columns=[],
                        data_types=[],
                        mode=[],
                        partition=None):
        if len(columns) == 0 or len(data_types) == 0:
            print("columns or data_types must not be empty")
            return
        if len(columns) != len(data_types):
            print("columns and data_types must have the same length")
            return
        bigquery_client = bigquery.Client()
        # Create the table only if it does not already exist
        try:
            bigquery_client.get_table(table_ref)
        except Exception:
            schema = []
            for i in range(len(data_types)):
                schema.append(
                    bigquery.SchemaField(columns[i], data_types[i], mode[i]))
            table = bigquery.Table(table_ref, schema=schema)
            if partition is not None:
                table.time_partitioning = bigquery.TimePartitioning(
                    type_=bigquery.TimePartitioningType.DAY,
                    field="process_time",
                )
            table = bigquery_client.create_table(table)
            return_name = "Created table {}".format(table.table_id)
            if partition is not None:
                return_name = "{}, partitioned on {}".format(
                    return_name, table.time_partitioning)
            print(return_name)
Example #6
    def create_table(self):
        """
        Method to create a table in the dataset with schema and data from a csv.
        """
        # creating a reference for the dataset in which the table is present
        dataset_ref = self.client.dataset('dataset_airflow_etl')

        # creating a reference to the table
        table_ref = dataset_ref.table('airflow_table_assignment_divya')

        try:
            # fetching details of the table from the reference
            self.client.get_table(table_ref)
        except NotFound:
            schema = [
                bigquery.SchemaField("state", "STRING", mode="REQUIRED"),
                bigquery.SchemaField("count", "INTEGER", mode="REQUIRED"),
                bigquery.SchemaField("date", "DATE", mode="REQUIRED"),
            ]
            table = bigquery.Table(table_ref, schema=schema)
            table.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field="date",  # name of column to use for partitioning
                expiration_ms=5184000000,  # 60 days
            )
            print("Created table {}, partitioned on column {}".format(
                table.table_id, table.time_partitioning.field))

            table = self.client.create_table(table)
Example #7
    def run_query(self, date, qstring=None):
        if qstring is None:
            qstring = read_string("sql/{}.sql".format(self.config["query"]))
            qparams = self.get_query_params(date)
            qstring = qstring.format(**qparams)
        table_ref = self.client.dataset(
            self.config["params"]["dataset"]).table(
                self.config["params"]["dest"])
        job_config = bigquery.QueryJobConfig()
        job_config.write_disposition = (
            bigquery.WriteDisposition.WRITE_APPEND if self.is_write_append()
            else bigquery.WriteDisposition.WRITE_TRUNCATE)
        if ("allow_field_addition" in self.config
                and self.config["allow_field_addition"]):
            job_config.schema_update_options = [
                bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
            ]
        job_config.destination = table_ref
        if "partition_field" in self.config:
            job_config.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field=self.config["partition_field"],
            )

        query = self.client.query(qstring, job_config=job_config)
        query.result()

        if "create_view_alt" in self.config and self.config["create_view_alt"]:
            self.create_view("_view")
Example #8
def create_groups_table(apps, schema_editor):
    dataset_ref = client.dataset(settings.BIGQUERY_DATASET_ID)
    # Only define fields that are required and not strings (because
    # non-required strings will be populated automatically)
    schema = [
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('id', 'INT64', mode='REQUIRED'),
        bigquery.SchemaField('urlname', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('created', 'TIMESTAMP'),
        bigquery.SchemaField('lat', 'FLOAT64'),
        bigquery.SchemaField('lon', 'FLOAT64'),
        bigquery.SchemaField('members', 'INT64'),
        bigquery.SchemaField('organizer_id', 'INT64'),
        bigquery.SchemaField('next_event_yes_rsvp_count', 'INT64'),
        bigquery.SchemaField('next_event_time', 'TIMESTAMP'),
        bigquery.SchemaField('category_id', 'INT64'),
        bigquery.SchemaField('meta_category_id', 'INT64'),
    ]
    table_ref = dataset_ref.table('groups')
    table = bigquery.Table(table_ref, schema=schema)

    # define partitioning by date
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, field='date')

    # define clustering by urlname
    table.clustering_fields = ['urlname']

    client.create_table(table)
Example #9
def load_new_telemetry_snippet_data(dataset_id, table_name, next_load_date, end_load_date):
    '''
    Reads a CSV file from a Google Cloud Storage bucket and loads it into BigQuery.
    :param dataset_id: Name of dataset to be loaded into
    :param table_name: Name of table to be loaded into
    :param next_load_date: Earliest date to be loaded into table_name
    :param end_load_date: Latest date to be loaded into table_name
    :return:
    '''
    while next_load_date < end_load_date:
        # Set dates required for loading new data
        next_load_date = datetime.strftime(next_load_date, '%Y%m%d')
        logging.info(f'snippetTelemetryDailyRetrieve: Starting load for next load date: {next_load_date}')
        client = bigquery.Client(project='ga-mozilla-org-prod-001')
        file = f'gs://snippets-data-transfer/daily-tracking-data/snippets_{next_load_date}.csv'
        load_dataset_id = dataset_id
        load_table_name = table_name
        load_table_suffix = next_load_date
        load_table_id = f'{load_table_name.lower()}_{load_table_suffix}'

        # Configure load job
        dataset_ref = client.dataset(load_dataset_id)
        table_ref = dataset_ref.table(load_table_id)
        load_job_config = bigquery.LoadJobConfig()  # load job call
        load_job_config.schema = [
            bigquery.SchemaField('sendDate', 'DATE'),
            bigquery.SchemaField('messageID', 'STRING'),
            bigquery.SchemaField('releaseChannel', 'STRING'),
            bigquery.SchemaField('locale', 'STRING'),
            bigquery.SchemaField('countryCode', 'STRING'),
            bigquery.SchemaField('os', 'STRING'),
            bigquery.SchemaField('version', 'STRING'),
            bigquery.SchemaField('impressions', 'INTEGER'),
            bigquery.SchemaField('clicks', 'INTEGER'),
            bigquery.SchemaField('blocks', 'INTEGER')
        ] # Define schema
        load_job_config.source_format = bigquery.SourceFormat.CSV
        load_job_config.skip_leading_rows = 1
        load_job_config.max_bad_records = 20
        load_job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field='sendDate',
        )
        load_job_config.write_disposition = 'WRITE_APPEND'  # Options are WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY

        load_job = client.load_table_from_uri(
                file,
                table_ref,
                location='US',
                job_config=load_job_config
            )

        assert load_job.job_type == 'load'

        load_job.result()  # Waits for the load job to finish
        logging.info(f'snippetTelemetryDailyRetrieve: File {file} loaded to table {table_ref.path}')

        # Set next_load_date
        next_load_date = datetime.strptime(next_load_date, '%Y%m%d') + timedelta(1)
    return
Example #10
def create_table(client, table):
    schema = client.schema_from_json(get_schema_file_object())
    table = bigquery.table.Table(table, schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, field="created_at"
    )
    return client.create_table(table)
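
Client.schema_from_json() accepts a file path or file object containing a bq-style JSON schema, so get_schema_file_object() presumably returns something like the sketch below (the field names are illustrative):

import io
import json

from google.cloud import bigquery

schema_json = [
    {"name": "id", "type": "INTEGER", "mode": "REQUIRED"},
    {"name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE"},
]
client = bigquery.Client()
schema = client.schema_from_json(io.StringIO(json.dumps(schema_json)))
print(schema)  # [SchemaField('id', 'INTEGER', 'REQUIRED'), SchemaField('created_at', ...)]
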
Example #11
    def get_table_ref(self,
                      name,
                      project,
                      dataset,
                      partition=None,
                      schema=None,
                      clustering_fields=None):
        tbb = bigquery.Table("{}.{}.{}".format(project, dataset.dataset_id,
                                               name),
                             schema=schema)
        if clustering_fields is not None and schema is not None:
            clustering_fields = list(
                map(lambda x: x.split(':')[0], clustering_fields))
            tbb.clustering_fields = clustering_fields
        if partition is not None:
            if partition['type'] == 'time':
                tbb.time_partitioning = bigquery.TimePartitioning(
                    type_=bigquery.TimePartitioningType.DAY,
                    require_partition_filter=False)
            elif partition['type'] == 'range':
                tbb.range_partitioning = bigquery.RangePartitioning(
                    bigquery.PartitionRange(partition['start'],
                                            partition['end'],
                                            partition['interval']),
                    partition['field'])
        return tbb
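
The partition and clustering_fields arguments are plain Python values; hypothetical inputs, inferred from the keys the method reads above (a "time" spec partitions on ingestion time and ignores any field, and clustering entries may carry a ":TYPE" suffix that gets stripped):

time_partition_spec = {"type": "time"}
range_partition_spec = {
    "type": "range",
    "field": "customer_id",
    "start": 0,
    "end": 1000000,
    "interval": 10000,
}
clustering = ["customer_id:INTEGER", "country:STRING"]
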
Example #12
def dest_partitioned_table_allow_jagged(bq: bigquery.Client, dest_dataset,
                                        monkeypatch) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    if os.getenv('GCP_PROJECT') is None:
        monkeypatch.setenv("GCP_PROJECT", bq.project)

    extra_field_for_jagged_row_test = bigquery.schema.SchemaField(
        "extra_jagged_row_test_column", "STRING")
    schema.append(extra_field_for_jagged_row_test)
    table: bigquery.Table = bigquery.Table(
        f"{os.getenv('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)
    return table
Example #13
    def __init__(
            self,
            project_id,
            credentials,
            table_name,
            table_schema,
            csv_path='csv'
    ):
        self.project_id = project_id
        self.credentials = credentials
        self.table_name = table_name
        self.table_schema = table_schema
        self.csv_path = csv_path
        # 1 year of data should always be enough
        self.default_table_expiration = 24 * 60 * 60 * 365
        self.date_col_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="date",
            expiration_ms=self.default_table_expiration * 1000,
        )
        self.uploader = BigQueryUploader(
            project_id=self.project_id,
            dataset_id=os.getenv("BIGQUERY_DATASET"),
            tmp_folder=self.csv_path,
            credentials=self.credentials
        )
Example #14
def client_load_partitioned_table(client, table_id):

    # [START bigquery_load_table_partitioned]
    from google.cloud import bigquery

    # TODO(developer): Construct a BigQuery client object.
    # client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
            bigquery.SchemaField("date", "DATE"),
        ],
        skip_leading_rows=1,
        time_partitioning=bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="date",  # Name of the column to use for partitioning.
            expiration_ms=7776000000,  # 90 days.
        ),
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states-by-date.csv"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)
    print("Loaded {} rows to table {}".format(table.num_rows, table_id))
Example #15
def load_csv_to_bq(project_name, dataset_name, table_name, uri):
    client = bigquery.Client()
    table = client.get_table(project_name + '.' + dataset_name + '.' + table_name)

    if 'detailed' in table_name:
        job_config = bigquery.LoadJobConfig(
            schema=[bigquery.SchemaField("brand", "STRING"),
                    bigquery.SchemaField("headline", "STRING"),
                    bigquery.SchemaField("review_date", "DATE"),
                    bigquery.SchemaField("review_content", "STRING"),
                    bigquery.SchemaField("review_votes", "INT64"),
                    bigquery.SchemaField("review_stars", "INT64"),
                    bigquery.SchemaField("ingestion_date", "DATETIME")],
            skip_leading_rows=1,
            field_delimiter='|',
            time_partitioning=bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field='ingestion_date'))
    else:
        job_config = bigquery.LoadJobConfig(
            schema=table.schema,
            skip_leading_rows=1,
            field_delimiter='|')

    load_job = client.load_table_from_uri(uri, table, job_config=job_config)

    try:
        load_job.result()
        print("Loaded {} rows to {}.{}".format(load_job.output_rows, dataset_name, table_name))
    except BadRequest as e:
        for err in load_job.errors:
            print('ERROR: {}'.format(err['message']))
Example #16
def execute_transformation_query(bq_client):
    """Executes transformation query to a new destination table.
    Args:
        bq_client: Object representing a reference to a BigQuery Client
    """
    dataset_ref = bq_client.get_dataset(bigquery.DatasetReference(
        project=config.billing_project_id,
        dataset_id=config.output_dataset_id))
    table_ref = dataset_ref.table(config.output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.time_partitioning = bigquery.TimePartitioning(
        field='usage_start_time',
        expiration_ms=None)
    sql = Template(file_to_string(config.sql_file_path))
    sql = sql.safe_substitute(billing_table=config.billing_project_id +
                                            '.' + config.billing_dataset_id +
                                            '.' + config.billing_table_name,
                              allocation_method=config.allocation_method
                              )
    logging.info('Attempting query on all dates...')
    # Execute Query
    query_job = bq_client.query(
        sql,
        job_config=job_config)

    query_job.result()  # Waits for the query to finish
    logging.info('Transformation query complete. All partitions are updated.')
Example #17
def create_functional_alltypes_parted_table(bqclient,
                                            functional_alltypes_parted_table):
    table = bigquery.Table(functional_alltypes_parted_table)
    table.schema = [
        bigquery.SchemaField("index", "INTEGER"),
        bigquery.SchemaField("Unnamed_0", "INTEGER"),
        bigquery.SchemaField("id", "INTEGER"),
        bigquery.SchemaField("bool_col", "BOOLEAN"),
        bigquery.SchemaField("tinyint_col", "INTEGER"),
        bigquery.SchemaField("smallint_col", "INTEGER"),
        bigquery.SchemaField("int_col", "INTEGER"),
        bigquery.SchemaField("bigint_col", "INTEGER"),
        bigquery.SchemaField("float_col", "FLOAT"),
        bigquery.SchemaField("double_col", "FLOAT"),
        bigquery.SchemaField("date_string_col", "STRING"),
        bigquery.SchemaField("string_col", "STRING"),
        bigquery.SchemaField("timestamp_col", "TIMESTAMP"),
        bigquery.SchemaField("year", "INTEGER"),
        bigquery.SchemaField("month", "INTEGER"),
    ]
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY)
    table.require_partition_filter = False
    bqclient.create_table(table, exists_ok=True)
    return table
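
Because the fixture above sets no partitioning field, the table is partitioned on ingestion time and queries filter on the _PARTITIONTIME pseudo column. A sketch with a placeholder table name:

sql = """
SELECT COUNT(*) AS n
FROM `my-project.my_dataset.functional_alltypes_parted`  -- placeholder name
WHERE _PARTITIONTIME = TIMESTAMP("2024-01-31")
"""
# rows = list(bqclient.query(sql).result())
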
Example #18
    def write_to_bq(self, date, messages):
        bq_client = bigquery.Client(project=self.project)

        table_name = f"messages_v{self.version}"
        if self.table_prefix is not None:
            table_name = f"{self.table_prefix}_{table_name}"

        load_config = bigquery.LoadJobConfig(
            time_partitioning=bigquery.TimePartitioning(
                field="load_date",
                require_partition_filter=True,
            ),
            create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            schema_update_options=[
                bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
            ],
        )
        load_job = bq_client.load_table_from_json(
            messages,
            destination=f"{self.bq_dataset}.{table_name}${date.replace('-', '')}",
            job_config=load_config,
        )

        load_job.result()
Example #19
    def create_empty_table(self,
                           project_id,
                           dataset_id,
                           table_id,
                           schema,
                           partition_field=None,
                           expiration=None):
        """Create table function.

        Create a table in the project_id/dataset at bigQuery.
        If the table alredy exists its gona be overwrited.

        Args:
            project_id (str): BigQuery project id
            table_id   (str): Name of the table to createTable.
            query      (str): Query to store as a table.

        Returns:
            bool: True for success, Raises an error otherwise.

        """

        table_ref = self.bigquery_client.dataset(dataset_id,
                                                 project_id).table(table_id)
        if isinstance(schema, list):
            schema = self._get_schema_from_json(schema)
        elif isinstance(schema, str):
            schema = self._get_schema_from_str(schema)
        table = bigquery.Table(table_ref, schema=schema)
        if partition_field:
            expiration_ms = expiration * 24 * 60 * 60 * 1000 if expiration else None
            partitioning = bigquery.TimePartitioning(
                field=partition_field, expiration_ms=expiration_ms)
            table.time_partitioning = partitioning
        table_ref = self.bigquery_client.create_table(table)
Example #20
def execute_query(bq_client: bigquery.Client, env_vars: dict, query_path: object,
                  output_table_name: str, time_partition: bool) -> None:
    """Executes transformation query to a new destination table.
    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time Partitioning table is only needed for final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)
    logging.info('Attempting query...')
    # Execute Query
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)

    query_job.result()  # Waits for the query to finish
Example #21
def tables(bq, dataset, generated_test):
    # load tables into dataset
    for table in generated_test.tables.values():
        destination = f"{dataset.dataset_id}.{table.name}"
        job_config = bigquery.LoadJobConfig(
            default_dataset=dataset,
            source_format=table.source_format,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )
        if table.schema is None:
            # autodetect schema if not provided
            job_config.autodetect = True
        else:
            job_config.schema = table.schema
            # look for time_partitioning_field in provided schema
            for field in job_config.schema:
                if field.description == "time_partitioning_field":
                    job_config.time_partitioning = bigquery.TimePartitioning(
                        field=field.name
                    )
                    break  # stop because there can only be one time partitioning field
        with open(table.source_path, "rb") as file_obj:
            job = bq.load_table_from_file(file_obj, destination, job_config=job_config)
        try:
            job.result()
        except BadRequest:
            print(job.errors)
            raise
Example #22
def exist_dataset_table(client, table_id, dataset_id, project_id, schema, clustering_fields=None):

    try:
        dataset_ref = "{}.{}".format(project_id, dataset_id)
        client.get_dataset(dataset_ref)  # Make an API request.

    except NotFound:
        dataset_ref = "{}.{}".format(project_id, dataset_id)
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = "US"
        dataset = client.create_dataset(dataset)  # Make an API request.
        logger.info("Created dataset {}.{}".format(client.project, dataset.dataset_id))

    try:
        table_ref = "{}.{}.{}".format(project_id, dataset_id, table_id)
        client.get_table(table_ref)  # Make an API request.

    except NotFound:

        table_ref = "{}.{}.{}".format(project_id, dataset_id, table_id)

        table = bigquery.Table(table_ref, schema=schema)

        table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="date"
        )

        if clustering_fields is not None:
            table.clustering_fields = clustering_fields

        table = client.create_table(table)  # Make an API request.
        logger.info("Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id))

    return 'ok'
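
A hypothetical call, assuming the surrounding module already imports NotFound and configures logger; the schema must include the "date" column that the table above is partitioned on:

from google.cloud import bigquery

client = bigquery.Client()
schema = [
    bigquery.SchemaField("date", "DATE", mode="REQUIRED"),
    bigquery.SchemaField("urlname", "STRING"),
]
exist_dataset_table(client, "groups", "analytics", "my-project", schema,
                    clustering_fields=["urlname"])
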
Example #23
def test_create_partitioned_table(client, to_delete):
    dataset_id = "create_table_partitioned_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = client.create_dataset(dataset_ref)
    to_delete.append(dataset)

    # [START bigquery_create_table_partitioned]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # project = client.project
    # dataset_ref = bigquery.DatasetReference(project, 'my_dataset')

    table_ref = dataset_ref.table("my_partitioned_table")
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
        bigquery.SchemaField("date", "DATE"),
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="date",  # name of column to use for partitioning
        expiration_ms=7776000000,
    )  # 90 days

    table = client.create_table(table)

    print("Created table {}, partitioned on column {}".format(
        table.table_id, table.time_partitioning.field))
    # [END bigquery_create_table_partitioned]

    assert table.time_partitioning.type_ == "DAY"
    assert table.time_partitioning.field == "date"
    assert table.time_partitioning.expiration_ms == 7776000000
Example #24
    def run_query(self, date, autodetect=False):
        dataset_ref = self.client.dataset(self.config["params"]["dataset"])
        job_config = bigquery.LoadJobConfig()
        job_config.write_disposition = (
            bigquery.WriteDisposition.WRITE_APPEND
            if self.is_write_append()
            else bigquery.WriteDisposition.WRITE_TRUNCATE
        )
        # don't autodetect once the schema exists; it may mis-infer STRING vs INTEGER
        job_config.autodetect = autodetect
        job_config.source_format = FILETYPES[self.config["filetype"]]
        if "partition_field" in self.config:
            job_config.time_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field=self.config["partition_field"],
            )
        uri = "gs://%s" % self.config["params"]["src"].format(start_date=date)

        load_job = self.client.load_table_from_uri(
            uri,
            dataset_ref.table(self.config["params"]["dest"]),
            location=self.config["params"]["location"],
            job_config=job_config,
        )
        log.info("Starting job {}".format(load_job.job_id))

        load_job.result()  # Waits for table load to complete.
        log.info("Job finished.")

        destination_table = self.client.get_table(
            dataset_ref.table(self.config["params"]["dest"])
        )
        log.info("Loaded {} rows.".format(destination_table.num_rows))
Example #25
def stripe_import(
    api_key: Optional[str],
    date: Optional[datetime],
    before_date: Optional[datetime],
    table: Optional[str],
    format_resources: bool,
    resource: Type[ListableAPIResource],
):
    """Import Stripe data into BigQuery."""
    if resource is stripe.Event and not date:
        click.echo("must specify --date for --resource=Event")
        sys.exit(1)
    if table and date:
        table = f"{table}${date:%Y%m%d}"
    with (TemporaryFile(mode="w+b") if table else sys.stdout.buffer) as file_obj:
        filtered_schema = FilteredSchema(resource)
        has_rows = False
        for row in _get_rows(api_key, date, before_date, resource):
            if format_resources or (table and api_key):
                row = filtered_schema.format_row(row)
            file_obj.write(ujson.dumps(row).encode("UTF-8"))
            file_obj.write(b"\n")
            has_rows = True
        if not has_rows:
            click.echo(f"no {filtered_schema.type}s returned")
            sys.exit(1)
        elif table:
            if file_obj.writable():
                file_obj.seek(0)
            warnings.filterwarnings("ignore", module="google.auth._default")
            job_config = bigquery.LoadJobConfig(
                clustering_fields=["created"],
                ignore_unknown_values=False,
                schema=filtered_schema.filtered,
                source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
                time_partitioning=bigquery.TimePartitioning(field="created"),
                write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            )
            if "$" in table:
                job_config.schema_update_options = [
                    bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
                ]
            job = bigquery.Client().load_table_from_file(
                file_obj=file_obj,
                destination=table,
                job_config=job_config,
            )
            try:
                click.echo(f"Waiting for {job.job_id}", file=sys.stderr)
                job.result()
            except Exception as e:
                click.echo(f"{job.job_id} failed: {e}", file=sys.stderr)
                for error in job.errors or ():
                    message = error.get("message")
                    if message and message != getattr(e, "message", None):
                        click.echo(message, file=sys.stderr)
                sys.exit(1)
            else:
                click.echo(f"{job.job_id} succeeded", file=sys.stderr)
Example #26
def set_defaults_save_job_config(job_config):
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    job_config.time_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        # If field is not set, the table is partitioned on the _PARTITIONTIME pseudo column.
        field=None)
    return job_config
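
A possible use of these defaults with a query job; the destination table and SQL are placeholders, not part of the example:

from google.cloud import bigquery

client = bigquery.Client()
job_config = set_defaults_save_job_config(bigquery.QueryJobConfig())
job_config.destination = "my-project.my_dataset.daily_events"  # placeholder
client.query("SELECT CURRENT_DATE() AS day", job_config=job_config).result()
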
Example #27
def create_timestamp_table(bqclient, timestamp_table):
    table = bigquery.Table(timestamp_table)
    table.schema = [
        bigquery.SchemaField("my_timestamp_parted_col", "DATE"),
        bigquery.SchemaField("string_col", "STRING"),
        bigquery.SchemaField("int_col", "INTEGER"),
    ]
    table.time_partitioning = bigquery.TimePartitioning(
        field="my_timestamp_parted_col")
    bqclient.create_table(table, exists_ok=True)
Example #28
def setting_options(load_job_config, schema, config, kind_name, update_date):
    """
     GCSにあるconfig.jsonとschema.jsonを取得・利用し、
     BigQueryのロードジョブの設定を行う。

     Args:
       load_job_config (google.cloud.bigquery.job.LoadJobConfig): BigQueryへのロードに使用する設定
       config (dict): GCSから読み込んだBigQueryロードの設定
       schema (dict): GCSから読み込んだBigQueryのスキーマ
       kind_name (STRING): データの種類
       update_date (STRING): ファイル連携日
     Returns:
       job_config (google.cloud.bigquery.job.LoadJobConfig): BigQueryへのロードに使用する設定
       dataset_id (STRING): BigQueryのdatasetID 
       table_id (STRING): BigQueryのtableID 
     """

    new_schema = list()
    # Import the schema file
    for i in range(len(schema)):
        try:
            new_schema.append(
                bigquery.SchemaField(schema[i]["name"], schema[i]["type"],
                                     schema[i]["mode"]))
        except KeyError:  # "mode" may be missing from the schema entry
            new_schema.append(
                bigquery.SchemaField(schema[i]["name"], schema[i]["type"]))

    # Assign the schema to the load configuration
    load_job_config.schema = new_schema
    # Import settings from config.json
    load_job_config.source_format = config["sourceFormat"]
    load_job_config.autodetect = config["autodetect"]
    load_job_config.create_disposition = config["createDisposition"]
    load_job_config.write_disposition = config["writeDisposition"]
    load_job_config.skip_leading_rows = config["skipLeadingRows"]
    load_job_config.null_marker = config["nullMarker"]
    load_job_config.encoding = config["encoding"]
    load_job_config.max_bad_records = config["maxBadRecords"]
    load_job_config.allow_quoted_newlines = config["allowQuotedNewlines"]
    load_job_config.allow_jagged_rows = config["allowJaggedRows"]
    load_job_config.quote_character = config["quote"]
    load_job_config.ignore_unknown_values = config["ignoreUnknownValues"]

    # Get the BigQuery dataset and table IDs
    dataset_id = config["destinationTable"]["datasetId"]
    table_id = config["destinationTable"]["tableId"]

    # Transaction data is loaded into a date partition, so configure partitioning here.
    if kind_name == "transaction":
        table_id = "{}${}".format(table_id, update_date)
        load_job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY)

    return load_job_config, dataset_id, table_id
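
The shape of config.json and schema.json is implied by the keys the function reads; a hypothetical example of each (all values are placeholders):

config = {
    "sourceFormat": "CSV",
    "autodetect": False,
    "createDisposition": "CREATE_IF_NEEDED",
    "writeDisposition": "WRITE_TRUNCATE",
    "skipLeadingRows": 1,
    "nullMarker": "",
    "encoding": "UTF-8",
    "maxBadRecords": 0,
    "allowQuotedNewlines": True,
    "allowJaggedRows": False,
    "quote": '"',
    "ignoreUnknownValues": True,
    "destinationTable": {"datasetId": "raw", "tableId": "transaction"},
}
schema = [
    {"name": "id", "type": "STRING", "mode": "REQUIRED"},
    {"name": "amount", "type": "NUMERIC"},  # "mode" may be omitted
]
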
Example #29
def init_big_query_uploader(project_id, dataset_id):
    uploader = BigQueryUploader(project_id, dataset_id)
    bigquery_ready = uploader.is_dataset_ready()
    if not bigquery_ready:
        print("Unable to connect to dataset {}, project {}, quitting".format(dataset_id, project_id))
        return None

    # Create tables if not exist
    date_col_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="date",
        # expiration_ms=BQ_STORAGE_DATA_EXPIRATION_MS,
    )

    tables_and_schemas = {
        "browsers": bq_schema.browsers(),
        "browser_users": bq_schema.browser_users(),

        "aggregated_browser_days": bq_schema.aggregated_browser_days(),
        "aggregated_browser_days_tags": bq_schema.aggregated_browser_days_tags(),
        "aggregated_browser_days_categories": bq_schema.aggregated_browser_days_categories(),
        "aggregated_browser_days_referer_mediums": bq_schema.aggregated_browser_days_referer_mediums(),

        "aggregated_user_days": bq_schema.aggregated_user_days(),
        "aggregated_user_days_tags": bq_schema.aggregated_user_days_tags(),
        "aggregated_user_days_categories": bq_schema.aggregated_user_days_categories(),
        "aggregated_user_days_referer_mediums": bq_schema.aggregated_user_days_referer_mediums(),
    }
    # tables partitioned by 'date' column
    for table_name, table_schema in tables_and_schemas.items():
        if not uploader.table_exists(table_name):
            uploader.create_table(table_name, table_schema, date_col_partitioning)

    # tables partitioned by 'time' column
    time_col_partitioning = bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="time",
        # expiration_ms=BQ_STORAGE_DATA_EXPIRATION_MS,
    )
    if not uploader.table_exists('events'):
        uploader.create_table('events', bq_schema.events(), time_col_partitioning)
    return uploader
Example #30
def _run_project_sql(bq_client, project, dataset, submission_date, slug):
    normalized_slug = _bq_normalize_name(slug)
    date_partition = str(submission_date).replace("-", "")
    destination_table = f"{project}.{dataset}.{normalized_slug}${date_partition}"
    print("destination_table", destination_table)
    init_sql_path = Path(
        os.path.join(OUTPUT_DIR, dataset, normalized_slug, INIT_FILENAME))
    query_sql_path = Path(
        os.path.join(OUTPUT_DIR, dataset, normalized_slug, QUERY_FILENAME))
    view_sql_path = Path(
        os.path.join(OUTPUT_DIR, dataset, normalized_slug, VIEW_FILENAME))

    query_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("submission_date", "DATE",
                                          str(submission_date)),
        ],
        use_legacy_sql=False,
        clustering_fields=["build_id"],
        destination=destination_table,
        default_dataset=f"{project}.{dataset}",
        time_partitioning=bigquery.TimePartitioning(field="submission_date"),
        write_disposition="WRITE_TRUNCATE",
        use_query_cache=True,
        allow_large_results=True,
    )
    init_query_text = init_sql_path.read_text()
    query_text = query_sql_path.read_text()
    view_text = view_sql_path.read_text()

    # Wait for init to complete before running queries
    init_query_job = bq_client.query(init_query_text)
    view_query_job = bq_client.query(view_text)
    results = init_query_job.result()

    query_job = bq_client.query(query_text, job_config=query_config)

    # Periodically print so airflow gke operator doesn't think task is dead
    elapsed = 0
    while not query_job.done():
        time.sleep(10)
        elapsed += 10
        if elapsed % 200 == 10:
            print("Waiting on query...")

    print(f"Total elapsed: approximately {elapsed} seconds")
    results = query_job.result()

    print(f"Query job {query_job.job_id} finished")
    print(f"{results.total_rows} rows in {destination_table}")

    # Add a view once the derived table is generated.
    view_query_job.result()