def test_bigquery_write(mocker, uploader):
  bq_client = mocker.MagicMock()

  mocker.patch.object(uploader, "_get_bq_client")
  uploader._get_bq_client.return_value = bq_client

  table = mocker.MagicMock()
  bq_client.get_table.return_value = table

  now = datetime.datetime.now().timestamp()

  account_config = AccountConfig("account_id", False, "ga_account_id", "", "")
  destination = Destination(
      "dest1",
      DestinationType.GA_MEASUREMENT_PROTOCOL,
      ["web_property", "view", "c", "list", "d", "buyers_custom_dim"])
  source = Source("orig1", SourceType.BIG_QUERY, ["dt1", "buyers"])
  execution = Execution(account_config, source, destination)

  uploader._do_process(Batch(execution, [{"uuid": "uuid-1"}, {"uuid": "uuid-2"}]), now)

  bq_client.insert_rows.assert_called_once_with(
      table,
      [{"uuid": "uuid-1", "timestamp": now},
       {"uuid": "uuid-2", "timestamp": now}],
      (SchemaField("uuid", "string"),
       SchemaField("timestamp", "timestamp")))
Example #2
def test_check_field_type_fail():
    local_schema = [SchemaField("a", "STRING", "REQUIRED")]
    remote_schema = [SchemaField("a", "INTEGER", "REQUIRED")]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "There is difference between SchemaField('a', 'STRING', 'REQUIRED', None, (), None) "
        "and SchemaField('a', 'INTEGER', 'REQUIRED', None, (), None)",
    ]
def column_type(name, schema_property):
    safe_name = safe_column_name(name, quotes=False)
    property_type = schema_property['type']
    property_format = schema_property.get('format', None)

    if 'array' in property_type:
        try:
            items_schema = schema_property['items']
            items_type = bigquery_type(items_schema['type'],
                                       items_schema.get('format', None))
        except KeyError:
            return SchemaField(safe_name, 'string', 'NULLABLE')
        else:
            return SchemaField(safe_name, items_type, 'REPEATED')

    elif 'object' in property_type:
        fields = [
            column_type(col, t)
            for col, t in schema_property.get('properties', {}).items()
        ]
        if fields:
            return SchemaField(safe_name, 'RECORD', 'NULLABLE', fields=fields)
        else:
            return SchemaField(safe_name, 'string', 'NULLABLE')

    else:
        result_type = bigquery_type(property_type, property_format)
        return SchemaField(safe_name, result_type, 'NULLABLE')
def main(gcp_credentials_filepath: str):
    gcp_credentials, project_id = load_gcp_credentials_and_project_from_file(
        gcp_credentials_filepath)
    bq_client = create_bq_client(gcp_credentials, project_id)

    dataset = get_bq_dataset(bq_client, 'dataset_x_1')

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
    table_schema = [
        SchemaField('field_INT64', 'INT64', 'NULLABLE'),
        SchemaField('field_NUMERIC', 'NUMERIC', 'NULLABLE'),
        SchemaField('field_FLOAT64', 'FLOAT64', 'NULLABLE'),
        SchemaField('field_BOOL', 'BOOL', 'NULLABLE'),
        SchemaField('field_STRING', 'STRING', 'NULLABLE'),
        SchemaField('field_BYTES', 'BYTES', 'NULLABLE'),
        SchemaField('field_DATE', 'DATE', 'NULLABLE'),
        SchemaField('field_DATETIME', 'DATETIME', 'NULLABLE'),
        SchemaField('field_TIME', 'TIME', 'NULLABLE'),
        SchemaField('field_TIMESTAMP', 'TIMESTAMP', 'NULLABLE'),
    ]

    table = create_bq_table(
        bq_client, dataset, table_id='table_2', table_schema=table_schema,
        table_description='some table description')

    print("table fully-qualified ID:", table.full_table_id)
    print("table GRN:", table.path)
    print("table description:", table.description)
    print("table type:", table.table_type)
    print("table size [MiB]:", table.num_bytes / 1014)
    print("table size [rows]:", table.num_rows)
    print("table schema fields:")
    for _field in table.schema:
        print("\t", _field)
    print("table API representation:", table.to_api_repr())
Example #5
    def test_to_schema_field(self):
        expected_schema_field1 = SchemaField(name='test',
                                             field_type='STRING',
                                             mode='NULLABLE',
                                             description='test_description')
        schema_field1_1 = BigQuerySchemaField('test', 'STRING', 'NULLABLE',
                                              'test_description')
        actual_schema_field1_1 = BigQuerySchemaField.to_schema_field(
            schema_field1_1)
        self.assertEqual(expected_schema_field1, actual_schema_field1_1)
        schema_field1_2 = BigQuerySchemaField('test', 'INTEGER', 'REQUIRED',
                                              'foo_bar')
        actual_schema_field1_2 = BigQuerySchemaField.to_schema_field(
            schema_field1_2)
        self.assertNotEqual(expected_schema_field1, actual_schema_field1_2)

        expected_schema_field2 = SchemaField(name='test',
                                             field_type='RECORD',
                                             mode='NULLABLE',
                                             description='test_description',
                                             fields=(expected_schema_field1, ))
        schema_field2_1 = BigQuerySchemaField('test', 'RECORD', 'NULLABLE',
                                              'test_description',
                                              (schema_field1_1, ))
        actual_schema_field2_1 = BigQuerySchemaField.to_schema_field(
            schema_field2_1)
        self.assertEqual(expected_schema_field2, actual_schema_field2_1)
        schema_field2_2 = BigQuerySchemaField('test', 'RECORD', 'NULLABLE',
                                              'test_description',
                                              (schema_field1_2, ))
        actual_schema_field2_2 = BigQuerySchemaField.to_schema_field(
            schema_field2_2)
        self.assertNotEqual(expected_schema_field2, actual_schema_field2_2)
Example #6
def build_schema(schema,
                 key_properties=None,
                 add_metadata=True,
                 force_fields={}):
    SCHEMA = []

    required_fields = set(key_properties) if key_properties else set()
    if "required" in schema:
        required_fields.update(schema["required"])

    for key, props in schema["properties"].items():

        if key in force_fields:
            SCHEMA.append(
                SchemaField(key, force_fields[key]["type"],
                            force_fields[key].get("mode", "nullable"),
                            force_fields[key].get("description", None), ()))
        elif not props:
            # if we end up with an empty record.
            continue

        else:
            SCHEMA.append(
                define_schema(props,
                              bigquery_transformed_key(key),
                              required_fields=required_fields))

    if add_metadata:
        for field in METADATA_FIELDS:
            SCHEMA.append(
                SchemaField(field, METADATA_FIELDS[field]["bq_type"],
                            "nullable", None, ()))

    return SCHEMA
Example #7
    def build_schema(self) -> Union[List[SchemaField], None]:
        """
        Attempts to build the schema that will be used for table creation

        Iterates through the dtypes items and generate the appropriate SchemaFields

        Returns:
            The schema generated if successful otherwise None
        """
        schema = None
        if self.dtypes is not None:
            schema = []
            dtypes = self.dtypes.apply(lambda x: x.name).to_dict()
            for key, value in dtypes.items():
                if value == "string":
                    schema.append(SchemaField(key, "STRING"))
                elif value == "bool":
                    schema.append(SchemaField(key, "BOOL"))
                elif value == "Int64" or value == "int64":
                    schema.append(SchemaField(key, "INTEGER"))
                elif value == "float64":
                    schema.append(SchemaField(key, "FLOAT"))
                elif value == "datetime64[ns]":
                    schema.append(SchemaField(key, "DATETIME"))
        return schema
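A rough usage sketch of the same dtype-to-SchemaField mapping on a toy DataFrame, assuming self.dtypes is simply pandas' DataFrame.dtypes:

import pandas as pd
from google.cloud.bigquery import SchemaField

df = pd.DataFrame({'name': pd.Series(['a'], dtype='string'),
                   'count': pd.Series([1], dtype='Int64'),
                   'score': pd.Series([0.5], dtype='float64')})
dtype_map = {'string': 'STRING', 'bool': 'BOOL', 'Int64': 'INTEGER',
             'int64': 'INTEGER', 'float64': 'FLOAT', 'datetime64[ns]': 'DATETIME'}
schema = [SchemaField(col, dtype_map[dt.name])
          for col, dt in df.dtypes.items()
          if dt.name in dtype_map]
print(schema)  # e.g. [SchemaField('name', 'STRING', ...), SchemaField('count', 'INTEGER', ...), ...]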
Example #8
def test_check_field_type_unspecified():
    local_schema = [SchemaField("a", "RANDOM_FIELD_TYPE", "REQUIRED")]
    remote_schema = [SchemaField("a", "INTEGER", "REQUIRED")]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Unspecified field type in SchemaField('a', 'RANDOM_FIELD_TYPE', 'REQUIRED', None, ()) "
        "or SchemaField('a', 'INTEGER', 'REQUIRED', None, ())"
    ]
def _field_to_schema(field: Field) -> SchemaField:
    field_type = _BASIC_TYPES_TO_NAME.get(field.type)
    if field_type:
        return SchemaField(
            name=field.name,
            field_type=field_type,
            description=_parse_field_description(field),
            mode=BigQueryFieldModes.REQUIRED,
        )

    if is_dataclass(field.type):
        return SchemaField(
            name=field.name,
            field_type=BigQueryTypes.STRUCT,
            mode=BigQueryFieldModes.REQUIRED,
            description=_parse_field_description(field),
            fields=_parse_fields(field.type),
        )

    # typing.Optional is the same as typing.Union[SomeType, NoneType]
    origin = getattr(field.type, "__origin__", None)
    if origin is Union:
        return _parse_optional(field)

    if origin is list:
        return _parse_list(field)

    raise TypeError(f"Unsupported type: {field.type}.")
Example #10
def main():
    # Preamble - gets some creds and a valid Google BigQuery Client object.
    gbq_creds_path = Path("~/.gbq-key.json").expanduser()
    client = Client.from_service_account_json(gbq_creds_path)

    # Example of df_to_table().
    schema = [
        SchemaField('field1', 'float64', 'REQUIRED'),
        SchemaField('field2', 'float64', 'REQUIRED'),
        SchemaField('field3', 'float64', 'REQUIRED'),
        SchemaField('field4', 'float64', 'REQUIRED')
    ]
    table = client.dataset('my_dataset').table('example_table_1', schema)
    df1 = pd.DataFrame(np.random.rand(10, 4))
    gbq.df_to_table(df1, table)

    # Example table_to_df().
    dataset = client.dataset('my_dataset')
    table = dataset.table('example_table_1')
    df2 = gbq.table_to_df(table)
    df2.info()

    # Example of query_to_df().
    query = 'select field1, field2 from my_dataset.example_table_1;'
    df3 = gbq.query_to_df(query, client)
    df3.info()
Example #11
    def create_table(self, dataset: str, table_id: str) -> bigquery.Table:
        dataset_ref = self.bq_client.dataset(dataset)
        table_ref = dataset_ref.table(table_id)

        schema = [
            SchemaField("image_id", SqlTypeNames.INTEGER, mode="REQUIRED"),
            SchemaField("filename", SqlTypeNames.STRING, mode="REQUIRED"),
            SchemaField("category_id", SqlTypeNames.INTEGER, mode="REQUIRED"),
            SchemaField("score", SqlTypeNames.FLOAT, mode="REQUIRED"),
            SchemaField(
                "segmentation",
                SqlTypeNames.RECORD,
                mode="REQUIRED",
                fields=[
                    SchemaField("size", SqlTypeNames.INTEGER, mode="REPEATED"),
                    SchemaField("counts", SqlTypeNames.STRING,
                                mode="REQUIRED"),
                ],
            ),
            SchemaField("bbox", SqlTypeNames.FLOAT, mode="REPEATED"),
            SchemaField("mask_area_fraction",
                        SqlTypeNames.FLOAT,
                        mode="REQUIRED"),
            SchemaField("mask_mean_score", SqlTypeNames.FLOAT,
                        mode="REQUIRED"),
        ]

        table = bigquery.Table(table_ref, schema=schema)
        table = self.bq_client.create_table(table, exists_ok=True)
        return table
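A hedged sketch of what one row matching the schema above could look like when streamed with insert_rows_json: REPEATED fields are Python lists and RECORD fields are nested dicts (the values are made up).

row = {
    'image_id': 1,
    'filename': '0001.jpg',
    'category_id': 3,
    'score': 0.87,
    'segmentation': {'size': [480, 640], 'counts': 'encoded-rle-string'},
    'bbox': [10.0, 20.0, 110.0, 220.0],
    'mask_area_fraction': 0.12,
    'mask_mean_score': 0.91,
}
# errors = self.bq_client.insert_rows_json(table, [row])
# assert not errors, errors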
Example #12
def test_check_schemas():
    local_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    assert list(check_schemas(local_schema, remote_schema)) == []
Example #13
def test_check_removed_local_nested_columns():
    local_schema = [SchemaField("a", "RECORD", "REQUIRED", fields=[])]
    remote_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Nested: Removed column SchemaField('b', 'INTEGER', 'NULLABLE', None, (), None)"
    ]
Example #14
def test_find_new_columns_missing_column_in_record():
    local_schema = [
        SchemaField("a", "RECORD", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [SchemaField("a", "RECORD", fields=[])]
    assert list(find_new_columns(local_schema, remote_schema)) == [
        SchemaField("b", "INTEGER", "NULLABLE", None, ())
    ]
Example #15
def test_find_new_columns_missing_column_in_record():
    local_schema = [
        SchemaField("a", "RECORD", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [SchemaField("a", "RECORD", fields=[])]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Nested: New column SchemaField('b', 'INTEGER', 'NULLABLE', None, ())"
    ]
def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data,
                   expected_data):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using wiretap with one data format each.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name}'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', data_type, mode='NULLABLE')
    ]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, [(None, data)])
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')

        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(
            wiretap.output_records
        ) == 1, f'Expected 1 record, received {len(wiretap.output_records)}'
        received_data = wiretap.output_records[0].field['data']

        assert received_data == expected_data
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #17
def handle_record_type(safe_name, schema_property, mode="NULLABLE"):
    fields = [
        column_schema(col, t)
        for col, t in schema_property.get('properties', {}).items()
    ]
    if fields:
        return SchemaField(safe_name, 'RECORD', mode, fields=fields)
    else:
        return SchemaField(safe_name, 'string', mode)
Example #18
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3, verifying that each batch is of
    size 3 and that 8 total records are captured. The small numbers are used because of the limitations
    described in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]

    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True,
                                                 batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is respected in each batch (including the last). For 8 records with a
        # batch size of 3, we expect batch sizes of 3, 3, and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            expected_size = (MAX_BATCH_SIZE
                             if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                             else TOTAL_RECORDS % MAX_BATCH_SIZE)
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == expected_size

        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
Example #19
def create_bigquery_dataset(ds_name, table_name, data):
    """
    Creates a dataset for the twits and fills it up with twit data, if it does not exist yet
    :param ds_name: dataset name in BigQuery
    :param table_name: table name for twit data within the dataset
    :param data: data to be uploaded
    :return:
    """
    # Define BigQuery schema
    schema = [
        SchemaField('Sentiment', 'INT64', mode='required'),
        SchemaField('SentimentText', 'STRING', mode='required'),
        SchemaField('language', 'STRING', mode='required'),
        SchemaField('twit_length', 'INT64', mode='required'),
        SchemaField('Training', 'BOOLEAN', mode='required')
    ]
    client = bigquery.Client()

    # Get or create the dataset
    dataset_ref = client.dataset(ds_name)
    dataset = bigquery.Dataset(dataset_ref)
    try:
        dataset = client.get_dataset(dataset)
        print('Dataset present')
    except:
        print('Dataset not found, creating...')
        dataset.location = 'EU'
        dataset = client.create_dataset(dataset)

    # Create table if it does not exist
    tables = list(client.list_tables(dataset_ref))
    table_ref = dataset_ref.table(table_name)
    table = bigquery.Table(table_ref, schema=schema)
    if table_name in [t.table_id for t in tables]:
        print('Table of twits already created')
        table = client.get_table(table_ref)
    else:
        print('Table twits not found, creating...')
        table = client.create_table(table)

    # Fill in table data
    print('Filling table twits...')
    rows = [tuple(r[1:]) for r in data.values]

    def _batch(iterable, n=1):
        l = len(iterable)
        for ndx in range(0, l, n):
            yield ndx, iterable[ndx:min(ndx + n, l)]

    # Iterate in batches to not exceed payload size limit
    for batch_n, batch in _batch(rows, n=1000):
        print('Uploading batch {}'.format(str(batch_n)))
        errors = client.insert_rows(table, batch)
        if errors:
            print('There were errors copying batch {}: {}'.format(
                str(batch_n), str(errors)))
Example #20
def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and
    confirm that Google BigQuery destination successfully receives them using the Google BigQuery client.

    This is achieved by using a deduplicator which assures that there is only one ingest to Google BigQuery.
    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
                               record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = {}'.format(data_from_bigquery))
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #21
def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name):
    """
    Create data using Google BigQuery destination with different column names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    data_to_insert = [(1, 'data')]
    rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row) for row in data_to_insert]
    data_to_expect = [(1, 'data')]
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)

    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name,
                                   table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField(column_name, 'STRING', mode='NULLABLE')]

    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
    def __create_test_table(self, table_name, dataset_id):
        table_schema = [
            SchemaField("int_data", "INT64"),
            SchemaField("str_data", "STRING")
        ]
        table_reference = TableReference(dataset_id, table_name)
        test_table = Table(table_reference, table_schema)
        test_table.time_partitioning = TimePartitioning("DAY")
        self.__delete_if_exists(test_table)
        self.GCP_BIGQUERY_CLIENT.create_table(test_table)
        return test_table
Example #23
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client
    and then check if Google BigQuery origin receives them using snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery',
                                                 type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(
        title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('full_name', 'STRING', mode='required'),
        SchemaField('age', 'INTEGER', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...',
                    dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(
            Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        rows_from_snapshot = [(record.value['value'][0]['value'],
                               int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]

        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
Example #24
def test_check_mode_fail():
    local_schema = [
        SchemaField("a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER")])
    ]
    remote_schema = [
        SchemaField("a", "RECORD", "NULLABLE", fields=[SchemaField("b", "INTEGER")])
    ]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "There is difference between SchemaField('a', 'RECORD', 'REQUIRED', None, "
        "(SchemaField('b', 'INTEGER', 'NULLABLE', None, (), None),), None) and "
        "SchemaField('a', 'RECORD', 'NULLABLE', None, (SchemaField('b', 'INTEGER', "
        "'NULLABLE', None, (), None),), None)",
    ]
Example #25
def test_sdk_representation_conversion_record() -> None:
    schema_out = [
        SchemaField('att_01', 'INT64', 'REQUIRED', 'Att 1', ()),
        SchemaField('att_02', 'RECORD', 'NULLABLE', 'Att 2', (SchemaField(
            'att_11', 'FLOAT64', 'REQUIRED', None,
            ()), SchemaField('att_12', 'STRING', 'NULLABLE', None, ())))
    ]

    schema_convert = module.sdk_representation(schema_in_record)

    assert schema_convert == schema_out,\
        "Conversion doesn't work"

    return
Example #26
def test_check_field_type_fail_nested():
    local_schema = [
        SchemaField(
            "a", "RECORD", "REQUIRED", fields=[SchemaField("b", "STRING", "REQUIRED")]
        )
    ]
    remote_schema = [
        SchemaField(
            "a", "RECORD", "REQUIRED", fields=[SchemaField("b", "INTEGER", "REQUIRED")]
        )
    ]
    assert list(check_schemas(local_schema, remote_schema)) == [
        "Nested: There is difference between SchemaField('b', 'STRING', 'REQUIRED', "
        "None, (), None) and SchemaField('b', 'INTEGER', 'REQUIRED', None, (), None)"
    ]
Example #27
def upload_to_bq(table_name, dataframe):

    table_id = GCLOUD_PROJECT_ID + '.' + DATASET_BIGQUERY + '.' + table_name
    table_ref = dataset_ref.table(table_name)

    schem = []
    data_df = dataframe.copy()
    data_df['events_loaded'] = False

    for col in users_columns:
        if col not in ('ae_total_app_session_length', 'ae_total_app_sessions', 'events_loaded'):
            data_df[col] = data_df[col].astype(str)

    # Strip the literal '$' and 'properties.' prefixes from column names
    # (regex=False so '$' and '.' are not treated as regex metacharacters).
    data_df.columns = data_df.columns.str.replace('$', '', regex=False)
    data_df.columns = data_df.columns.str.replace('properties.', '', regex=False)

    if if_tbl_exists(bigquery_client, table_ref):
        table = bigquery_client.get_table(table_ref)  # API Request
        schem = table.schema
    else:
        for column in data_df:
            data_df[column] = data_df[column].apply(
                lambda x: 'DICT' if isinstance(x, dict) else x)
            data_df[column] = data_df[column].apply(
                lambda x: 'LIST' if isinstance(x, list) else x)
            # If it's a nested field we drop it
            if not data_df.loc[data_df[column] == 'DICT'].empty:
                data_df = data_df.drop([column], axis=1)
            elif not data_df.loc[data_df[column] == 'LIST'].empty:
                data_df = data_df.drop([column], axis=1)
            elif data_df[column].dtype == 'int64':
                schem.append(SchemaField(column, 'INTEGER'))
            elif data_df[column].dtype == 'float64':
                schem.append(SchemaField(column, 'FLOAT'))
            elif data_df[column].dtype == 'bool':
                schem.append(SchemaField(column, 'BOOLEAN'))
            elif data_df[column].dtype == 'datetime64[ns]':
                schem.append(SchemaField(column, 'TIMESTAMP'))
            else:
                schem.append(SchemaField(column, 'STRING'))

    job_config = bigquery.LoadJobConfig(schema=schem)

    job = bigquery_client.load_table_from_dataframe(data_df,
                                                    table_id,
                                                    job_config=job_config)
    job.result()  # wait for the load job to complete and surface any errors
Example #28
def upload_tweets():
    big_query_client = bigquery.Client.from_service_account_json('my-beam-project-b2834963a4ae.json')

    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    dataset = big_query_client.create_dataset(dataset)

    SCHEMA = [
        SchemaField('Tweets', 'STRING', mode='Nullable'),

    ]
    table_ref = big_query_client.dataset('Tweets').table('tabletweet')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0
    load_config.schema = SCHEMA
    load_config.allow_quoted_newlines = True
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200


    with open('tweets.csv', 'rb') as readable:
        job = big_query_client.load_table_from_file(
            readable, table_ref, job_config=load_config)
    job.result()  # wait for the load job to finish before reporting success
    print('tweets file uploaded to big query')
Example #29
def table_update(client, to_delete):
    """Update a table's metadata."""
    DATASET_NAME = 'table_update_dataset_%d' % (_millis(), )
    TABLE_NAME = 'table_update_table_%d' % (_millis(), )
    dataset = client.dataset(DATASET_NAME)
    dataset.description = ORIGINAL_DESCRIPTION
    dataset.create()
    to_delete.append(dataset)

    table = dataset.table(TABLE_NAME, SCHEMA)
    table.friendly_name = ORIGINAL_FRIENDLY_NAME
    table.description = ORIGINAL_DESCRIPTION
    table.create()
    to_delete.insert(0, table)

    # [START table_update]
    assert table.friendly_name == ORIGINAL_FRIENDLY_NAME
    assert table.description == ORIGINAL_DESCRIPTION
    NEW_SCHEMA = table.schema[:]
    NEW_SCHEMA.append(SchemaField('phone', 'string'))
    table.friendly_name = UPDATED_FRIENDLY_NAME
    table.description = UPDATED_DESCRIPTION
    table.schema = NEW_SCHEMA
    table.update()  # API request
    assert table.friendly_name == UPDATED_FRIENDLY_NAME
    assert table.description == UPDATED_DESCRIPTION
    assert table.schema == NEW_SCHEMA
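The snippet above uses the pre-1.0 client API (dataset.create(), table.create(), table.update()). A rough equivalent with newer google-cloud-bigquery releases, using hypothetical project/dataset/table IDs, might look like this:

from google.cloud import bigquery
from google.cloud.bigquery import SchemaField

client = bigquery.Client()
table = client.get_table('my-project.table_update_dataset.table_update_table')  # hypothetical ID
table.schema = list(table.schema) + [SchemaField('phone', 'STRING')]
table.friendly_name = 'Updated friendly name'
table.description = 'Updated description'
table = client.update_table(table, ['schema', 'friendly_name', 'description'])  # API request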
Example #30
    def setUp(self) -> None:
        self.mock_project_id = "fake-recidiviz-project"
        self.mock_dataset_id = "fake-dataset"
        self.mock_table_id = "test_table"
        self.mock_table_schema = [
            SchemaField("my_column", "STRING", "NULLABLE", None, ())
        ]
        self.mock_export_uri = "gs://fake-export-uri"
        self.mock_dataset = bigquery.dataset.DatasetReference(
            self.mock_project_id, self.mock_dataset_id)
        self.mock_table = self.mock_dataset.table(self.mock_table_id)

        self.project_id_patcher = mock.patch(
            "recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.mock_project_id

        self.mock_load_job_patcher = mock.patch(
            "google.cloud.bigquery.job.LoadJob")
        self.mock_load_job = self.mock_load_job_patcher.start()
        self.mock_load_job.destination.return_value = self.mock_table

        Table = collections.namedtuple("Table", ["name"])
        self.tables_to_export = [Table("first_table"), Table("second_table")]
        self.mock_bq_refresh_config = mock.Mock()
        self.mock_query_builder = mock.Mock()
        self.mock_bq_refresh_config.get_tables_to_export.return_value = (
            self.tables_to_export)
        self.mock_bq_refresh_config.get_stale_bq_rows_for_excluded_regions_query_builder.return_value = (
            self.mock_query_builder)

        self.mock_bq_client = create_autospec(BigQueryClient)