Example #1
 def test_listing_tables(self):
     # type: () -> None
     dataset_ref1 = DatasetReference('my_project', 'dataset1')
     self.bq_client.create_dataset(Dataset(dataset_ref1))
     self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
     self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))
     six.assertCountEqual(
         self,
         [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
         ['table1', 'table2'])
Example #2
    def setUp(self):
        super(ClientWriteFromQueryTest, self).setUp()
        dataset_ref = DatasetReference(self.bq_client.project, 'my_dataset')
        schema = [SchemaField(name="a", field_type='INT64'),
                  SchemaField(name="b", field_type='FLOAT64'),
                  ]
        self.source_table = Table(TableReference(dataset_ref, 'source_table'), schema)
        self.destination_table = Table(TableReference(dataset_ref, 'destination_table'), schema)
        self.bq_client.create_dataset(Dataset(dataset_ref))
        self.bq_client.create_table(self.source_table)
        # We don't create the destination table here; some tests do not want it created.

        # Stick two rows into source_table
        self.assertFalse(self.bq_client.insert_rows(self.source_table,
                                                    [{'a': 1, 'b': 2.5}, {'a': 3, 'b': 4.25}]))
Example #3
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: identifies the source dataset
    :param dst_dataset_id: identifies the destination dataset
    :return: list of completed BigQuery query jobs
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
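A minimal usage sketch of the loader above, assuming load, the bq helper module, SELECT_TPL, and LOGGER from the snippet are importable; the project and dataset IDs below are placeholders, not values from the original.

# Hypothetical invocation of load(); all identifiers are illustrative assumptions.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')        # assumed project
jobs = load(project_id='my-project',
            bq_client=client,
            src_dataset_id='vocabulary_staging',       # assumed source dataset
            dst_dataset_id='vocabulary')               # assumed destination dataset
print([job.job_id for job in jobs])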
Example #4
def copy_fitbit_tables_from_views(client, from_dataset, to_dataset,
                                  table_prefix):
    """
    Copies tables from views with prefix

    :param client: bq client
    :param from_dataset: dataset containing views
    :param to_dataset: dataset to create tables
    :param table_prefix: prefix added to table_ids
    :return:
    """
    for table in FITBIT_TABLES:
        schema_list = bq.get_table_schema(table)
        fq_dest_table = f'{client.project}.{to_dataset}.{table}'
        dest_table = Table(fq_dest_table, schema=schema_list)
        dest_table = client.create_table(dest_table)
        LOGGER.info(f'Created empty table {fq_dest_table}')

        fields_name_str = ',\n'.join([item.name for item in schema_list])
        fields_casted_str = ',\n'.join([
            cast_to_schema_type(item.name, item.field_type)
            for item in schema_list
        ])
        content_query = INSERT_QUERY.render(fq_dest_table=fq_dest_table,
                                            fields=fields_name_str,
                                            fields_casted=fields_casted_str,
                                            client=client,
                                            from_dataset=from_dataset,
                                            table_prefix=table_prefix,
                                            table=table)
        job = client.query(content_query)
        job.result()

    LOGGER.info(f'Copied fitbit tables from `{from_dataset}` to `{to_dataset}`')
Example #5
    def test_should_insert_records_to_non_partitioned_table(self):
        # given
        table_id = f'{self.dataset_manager.project_id}.{self.dataset_manager.dataset_name}.example_test_table'
        table = Table(table_id,
                      schema=[
                          {
                              "mode": "NULLABLE",
                              "name": "example_field",
                              "type": "STRING"
                          },
                      ])
        self.dataset_manager.create_table_from_schema('example_test_table',
                                                      table=table)

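        # when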
        self.dataset_manager.insert('example_test_table',
                                    [{
                                        "example_field": "example_field_value"
                                    }],
                                    partitioned=False)

        # then
        expected_result = [{"example_field": "example_field_value"}]
        actual_result = self.dataset_manager.collect('''
        SELECT * 
        FROM `{example_test_table}`
        ''').to_dict(orient='records')
        self.assertEqual(expected_result, actual_result)
Example #6
def insert_rows_from_uri(client, job_location, dataset, table_name, table_schema, table_uri, load_job_id, timeout=10):
    """
    Insert rows into a table from a remote GCS URI.
    :param client: BQ Client
    :param job_location: BQ job location
    :param dataset: BQ Dataset
    :param table_name: The table name string
    :param table_schema: The table schema
    :param table_uri: The GCS blob URI to load from
    :param load_job_id: Load job ID
    :param timeout: optional timeout in seconds, defaults to 10
    :return: The created Table, or None if the load job failed
    """
    table_ref = dataset.table(table_name)
    table = Table(table_ref, table_schema)
    table_load_job_conf = bigquery.LoadJobConfig(
        schema=table_schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    )
    load_job = client.load_table_from_uri(
        table_uri,
        table,
        job_id=load_job_id,
        location=job_location,
        job_config=table_load_job_conf
    )
    try:
        load_job.result(timeout=timeout)
        rows = client.list_rows(table, timeout=timeout)
        print_rows(rows)
    except Exception as error:
        logging.error(error)
    else:
        return table
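A sketch of how insert_rows_from_uri might be called, assuming the helper above and its print_rows dependency are in scope; the dataset, schema, GCS URI, and job ID below are illustrative assumptions.

# Hypothetical call; all names are placeholders.
from google.cloud import bigquery

client = bigquery.Client(project='my-project')
dataset = client.get_dataset('my_dataset')              # assumed dataset
schema = [bigquery.SchemaField('id', 'INT64'),
          bigquery.SchemaField('name', 'STRING')]
table = insert_rows_from_uri(client,
                             job_location='US',
                             dataset=dataset,
                             table_name='people',
                             table_schema=schema,
                             table_uri='gs://my-bucket/people.ndjson',
                             load_job_id='load_people_0001')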
Example #7
    def test_table_lookup(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        table_ref1 = TableReference(dataset_ref1, 'table1')
        table1 = Table(table_ref1, _TEST_SCHEMA)

        # Trying to get the same dataset/table in another project doesn't work.
        with self.assertRaisesRegexp(NotFound, 'other_project'):
            self.bq_client.get_table(
                TableReference(DatasetReference('other_project', 'dataset1'),
                               'table1'))

        # Trying to get the table before the dataset exists doesn't work
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.get_table(table_ref1)
        self.bq_client.create_dataset(Dataset(dataset_ref1))

        # Trying to get the table before the table exists doesn't work
        with self.assertRaises(NotFound):
            self.bq_client.get_table(table_ref1)

        self.bq_client.create_table(table1)

        # Assert the created table has the expected properties.
        table_found = self.bq_client.get_table(table_ref1)
        self.assertEqual(table1.project, "my_project")
        self.assertEqual(table1.dataset_id, "dataset1")
        self.assertEqual(table1.table_id, "table1")
        six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
Example #8
def make_table(project,
               dataset_id,
               table_id,
               friendly_name=None,
               description=None,
               expires=None,
               partitioning_type=None,
               view_use_legacy_sql=None,
               view_query=None,
               schema=None,
               labels=None):
    dataset_ref = DatasetReference(project, dataset_id)
    table_ref = TableReference(dataset_ref, table_id)
    table = Table(table_ref)
    table.friendly_name = friendly_name
    table.description = description
    table.expires = expires
    table.partitioning_type = partitioning_type
    if view_use_legacy_sql is not None:
        table.view_use_legacy_sql = view_use_legacy_sql
    if view_query is not None:
        table.view_query = view_query
    table.schema = schema
    if labels is not None:
        table.labels = labels
    return table
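A hedged example of building a table definition with make_table; the project, dataset, and schema below are assumptions for illustration, not part of the original.

# Hypothetical usage; identifiers and schema are placeholders.
from google.cloud.bigquery import SchemaField

table = make_table(project='my-project',
                   dataset_id='reporting',
                   table_id='daily_users',
                   description='Daily active user counts',
                   schema=[SchemaField('day', 'DATE'),
                           SchemaField('active_users', 'INT64')],
                   labels={'team': 'analytics'})
print(table.project, table.dataset_id, table.table_id)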
Example #9
        def create_view_task():

            client = Client()

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            table = Table(dest_table_ref)

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql = read_file(sql_path, environment)
            table.view_query = sql

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

            try:
                table = client.create_table(table)
            except Conflict:
                # https://cloud.google.com/bigquery/docs/managing-views
                table = client.update_table(table, ['view_query'])
            assert table.table_id == dest_table_name
Example #10
        def create_view_task(ds, **kwargs):

            template_context = kwargs.copy()
            template_context['ds'] = ds
            template_context['params'] = environment

            client = Client()

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dataset_name,
                project=destination_dataset_project_id).table(dest_table_name)
            table = Table(dest_table_ref)

            sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
            sql_template = read_file(sql_path)
            sql = kwargs['task'].render_template('', sql_template,
                                                 template_context)
            table.view_query = sql

            description_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/descriptions/{task}.txt'.format(
                    task=task))
            table.description = read_file(description_path)
            logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

            try:
                table = client.create_table(table)
            except Conflict:
                # https://cloud.google.com/bigquery/docs/managing-views
                table = client.update_table(table, ['view_query'])
            assert table.table_id == dest_table_name
Example #11
def test_dataflow_event_big_query_success(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check that the Google BigQuery origin receives them using wiretap, including a query-success event.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    events_wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination
    google_bigquery >= events_wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT

        # We have exactly one output record, check that it is a big-query-success event
        assert len(
            events_wiretap.output_records
        ) == 1, f'Expected 1 records, received {len(events_wiretap.output_records)}'
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'big-query-success', 'Received %s as event type (expected big-query-success)' % event_type
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #12
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Write data with the Google BigQuery destination using different batch sizes
    and then check that it has been correctly created using the BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    number_records = 1000
    batch_size = number_records//number_batches

    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=batch_size,
                                      fields_to_generate=[
                                          {"type": "INTEGER", "field": "age"},
                                          {"type": "STRING", "field": "full_name"}
                                      ])

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_data_generator >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(number_records)
        sdc_executor.stop_pipeline(pipeline)

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        # The Dev Data Generator produces random values, so the check below compares the
        # number of rows landed in BigQuery against the pipeline's input-record counter.

        history = sdc_executor.get_pipeline_history(pipeline)
        records = history.latest.metrics.counter('pipeline.batchInputRecords.counter').count

        assert len(data_from_bigquery) == records
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #13
def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data,
                   expected_data):
    """
    Create data using Google BigQuery client
    and then check that the Google BigQuery origin receives it using wiretap, one data type at a time.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name}'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', data_type, mode='NULLABLE')
    ]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, [(None, data)])
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(
            wiretap.output_records
        ) == 1, f'Expected 1 record, received {len(wiretap.output_records)}'
        received_data = wiretap.output_records[0].field['data']

        assert received_data == expected_data
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #14
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3, verifying that each batch is of
    size 3 and that 8 total records are captured. The small numbers are used because of the limitations
    described in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]

    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True,
                                                 batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is being respected in each batch (including the last). In this case,
        # for 8 records, we'd expect batch sizes of 3, 3, and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            expected_batch_size = (MAX_BATCH_SIZE if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                                   else TOTAL_RECORDS % MAX_BATCH_SIZE)
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == expected_batch_size

        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
Example #15
 def test_listing_tables_with_max(self):
     # type: () -> None
     dataset_ref1 = DatasetReference('my_project', 'dataset1')
     self.bq_client.create_dataset(Dataset(dataset_ref1))
     for i in range(10):
         self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table{}'.format(i)),
                                           _TEST_SCHEMA))
     self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
     self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
     self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
Example #16
def create_table(client, dataset_ref, table_name, is_partitioned=False):
    table_ref = dataset_ref.table(table_name)
    table_obj = Table(table_ref, schema=TABLE_SCHEMA)

    if is_partitioned:
        time_partitioning = TimePartitioning()
        time_partitioning.field = 'partition_value'
        table_obj.time_partitioning = time_partitioning

    return client.create_table(table_obj)
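A minimal sketch of calling the helper above to create a partitioned table, assuming TABLE_SCHEMA from the snippet includes a 'partition_value' field; the project and dataset names are placeholder assumptions.

# Hypothetical usage; project/dataset names are placeholders.
from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference

client = bigquery.Client(project='my-project')
dataset_ref = DatasetReference('my-project', 'my_dataset')
partitioned_table = create_table(client, dataset_ref, 'events_partitioned', is_partitioned=True)
print(partitioned_table.time_partitioning)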
Example #17
def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and
    confirm that the Google BigQuery destination successfully receives them using the Google BigQuery client.

    This is achieved by using a deduplicator which assures that there is only one ingest to Google BigQuery.
    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
                               record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = {}'.format(data_from_bigquery))
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #18
def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name):
    """
    Create data using Google BigQuery destination with different column names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    data_to_insert = [(1, 'data')]
    rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row) for row in data_to_insert]
    data_to_expect = [(1, 'data')]
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField(column_name, 'STRING', mode='NULLABLE')]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #19
 def __create_test_table(self, table_name, dataset_id):
     table_schema = [
         SchemaField("int_data", "INT64"),
         SchemaField("str_data", "STRING")
     ]
     table_reference = TableReference(dataset_id, table_name)
     test_table = Table(table_reference, table_schema)
     test_table.time_partitioning = TimePartitioning("DAY")
     self.__delete_if_exists(test_table)
     self.GCP_BIGQUERY_CLIENT.create_table(test_table)
     return test_table
Example #20
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(
        title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('full_name', 'STRING', mode='required'),
        SchemaField('age', 'INTEGER', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        rows_from_snapshot = [(record.value['value'][0]['value'],
                               int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]

        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #21
 def get_table(self, dataset_name, table_name, schema):
     dataset = self.client.dataset(dataset_name)
     dataset_table_names = [
         t.table_id for t in self.client.list_tables(dataset)
     ]
     table_ref = dataset.table(table_name)
     if table_name not in dataset_table_names:
         assert schema
         table = Table(table_ref, self.schema_object(schema))
         table = self.client.create_table(table)
     else:
         table = self.client.get_table(table_ref)
     return table
Example #22
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Create data using Google BigQuery destination with different dataset names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    table_name = get_random_string(string.ascii_letters, 5)
    # If tests fail for any reason, leftovers with equal names might lead to more errors
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #23
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does exist,
    it will create a partition in the designated table or append to the same partition.
    This is necessary for storing data that has "RECORD" type fields, which do not conform to a dataframe.
    The data is stored using a JSON file object since it is one of the ways BigQuery expects it.
    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as 'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the destination table

    :return: returns the bq job_id for the loading of digital health data
    """

    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError(
            f'Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
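A hedged sketch of invoking the loader above; the project ID, destination table, and JSON payload below are illustrative assumptions, and get_client/get_table_schema are assumed importable as in the snippet.

# Hypothetical invocation; all values below are placeholders, not from the original.
sample_rows = [
    {'person_id': 1, 'wearable': 'fitbit', 'status': 'YES'},
    {'person_id': 2, 'wearable': 'fitbit', 'status': 'NO'},
]
job_id = store_digital_health_status_data(
    project_id='my-project',
    json_data=sample_rows,
    destination_table='my-project.drc_ops.digital_health_sharing_status')
print(f'Digital health load job finished: {job_id}')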
Example #24
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Create data using Google BigQuery client with specific table names
    and then check if Google BigQuery origin receives them using wiretap.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    table_name = get_random_string(string.ascii_letters, 5)
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f"SELECT * FROM `{dataset_name}`.`{table_name}` ORDER BY id"
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #25
 def copy_vocab_tables(cls, vocabulary_id):
     """
     Copy the vocabulary tables over to the test dataset_id.
     :param vocabulary_id: identifies the dataset containing the vocabulary tables to copy
     :return: None
     """
     # Copy vocab tables over to the test dataset
     vocabulary_dataset = cls.client.get_dataset(vocabulary_id)
     for src_table in bq.list_tables(cls.client, vocabulary_dataset):
         schema = bq.get_table_schema(src_table.table_id)
         destination = f'{cls.project_id}.{cls.dataset_id}.{src_table.table_id}'
         dst_table = cls.client.create_table(Table(destination,
                                                   schema=schema),
                                             exists_ok=True)
         cls.client.copy_table(src_table, dst_table)
Example #26
    def get_table(self, table_ref, retry=None):
        # type: (TableReference, Optional[Retry]) -> Table
        """Looks up a table.

        Args:
            table_ref: Table reference to find.
            retry: If provided, what retry strategy to use (unused in this implementation).

        Returns:
            Table found.
        """
        del retry  # Unused in this implementation.
        typed_dataframe = self._safe_lookup(table_ref.project,
                                            table_ref.dataset_id,
                                            table_ref.table_id)
        return Table(table_ref, typed_dataframe.to_bq_schema())
Example #27
def createTable(tablename, global_dataset_ref):

    schema = [
        #Enter Schema here.
        # SchemaField('url', 'STRING', mode='required'),
        # SchemaField('views', 'INTEGER', mode='required')
    ]

    table_ref = global_dataset_ref.table(tablename)

    table = Table(table_ref, schema=schema)
    table = bigquery_client.create_table(table)

    assert table.table_id == tablename

    return
Example #28
    def setUp(self):
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.schema = [
            SchemaField("person_id", "INT64"),
            SchemaField("first_name", "STRING"),
            SchemaField("last_name", "STRING"),
            SchemaField("algorithm", "STRING")
        ]

        self.ps_api_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable')
        ]

        self.id_match_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable'),
            dict(name='algorithm', type='string', mode='nullable')
        ]

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

        # Create and populate the ps_values site table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = self.ps_values_table_id

        table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                      schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()
Example #29
    def create(self, table_id, schema):
        """ Create a table in Google BigQuery given a table and schema

        Parameters
        ----------
        table_id : str
            Name of the table to be created
        schema : dict
            Use generate_bq_schema to generate your table schema from a
            dataframe.
        """
        from google.cloud.bigquery import SchemaField
        from google.cloud.bigquery import Table

        if self.exists(table_id):
            raise TableCreationError(
                "Table {0} already " "exists".format(table_id)
            )

        if not _Dataset(self.project_id, credentials=self.credentials).exists(
            self.dataset_id
        ):
            _Dataset(
                self.project_id,
                credentials=self.credentials,
                location=self.location,
            ).create(self.dataset_id)

        table_ref = self.client.dataset(self.dataset_id).table(table_id)
        table = Table(table_ref)

        # Manually create the schema objects, adding NULLABLE mode
        # as a workaround for
        # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
        for field in schema["fields"]:
            if "mode" not in field:
                field["mode"] = "NULLABLE"

        table.schema = [
            SchemaField.from_api_repr(field) for field in schema["fields"]
        ]

        try:
            self.client.create_table(table)
        except self.http_error as ex:
            self.process_http_error(ex)
Example #30
def send_to_bq(dataset, table, row):
    bigquery_client = bigquery.Client(project='theo-home')

    table_ref = TableReference(
        dataset_ref=DatasetReference(dataset_id=dataset, project='theo-home'),
        table_id=table,
    )

    schema = [SchemaField(name=field, field_type=bq_types[type(data)]) for field, data in row.items()]

    table = bigquery_client.create_table(
        Table(table_ref, schema=schema),
        exists_ok=True
    )

    errors = bigquery_client.insert_rows(table, [row])
    if errors:
        print(errors, file=sys.stderr)
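send_to_bq relies on a bq_types mapping that is not shown in the snippet; a plausible definition and a call are sketched below as assumptions, not the original code.

# Assumed mapping from Python types to BigQuery field types (not part of the original).
bq_types = {
    str: 'STRING',
    int: 'INT64',
    float: 'FLOAT64',
    bool: 'BOOL',
}

# Hypothetical call; dataset and table names are placeholders.
send_to_bq('sensor_data', 'temperature', {'room': 'kitchen', 'celsius': 21.5})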