Example #1
def test_create_tables_from_dict(self):
    # type: () -> None
    self.client.create_tables_from_dict({
        'empty_1': [
            SchemaField('col1', 'INTEGER'),
            SchemaField('col2', 'STRING'),
        ],
        'empty_2': [
            SchemaField('col1', 'FLOAT'),
            SchemaField('col2', 'INTEGER'),
        ]
    })
    self.assertEqual([('col1', 'INTEGER', 'NULLABLE'),
                      ('col2', 'STRING', 'NULLABLE')],
                     [(x.name, x.field_type, x.mode)
                      for x in self.client.get_schema(
                          self.default_test_dataset_id, 'empty_1')])
    self.assertEqual([('col1', 'FLOAT', 'NULLABLE'),
                      ('col2', 'INTEGER', 'NULLABLE')],
                     [(x.name, x.field_type, x.mode)
                      for x in self.client.get_schema(
                          self.default_test_dataset_id, 'empty_2')])
Example #2
def test_simple_query(bqtk: BQTestKit):
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        schema = [SchemaField("f1", field_type="STRING")]
        with ds.table("table_bar", schema=schema).isolate() as t:
            result = bqtk.query_template(
                from_=f"select count(*) as nb from `{t.fqdn()}`").run()
            assert len(result.schema) == 1
            assert result.schema[0].name == "nb"
            assert result.schema[0].field_type.upper() in ["INTEGER", "INT64"]
            assert len(result.rows) == 1
            assert result.total_rows == 1
            assert result.rows[0]["nb"] == 0
Example #3
    def test_execute_w_query(self):
        from google.cloud.bigquery.schema import SchemaField
        from google.cloud.bigquery import dbapi

        connection = dbapi.connect(self._mock_client(
            rows=[('hello', 'world', 1), ('howdy', 'y\'all', 2)],
            schema=[
                SchemaField('a', 'STRING', mode='NULLABLE'),
                SchemaField('b', 'STRING', mode='REQUIRED'),
                SchemaField('c', 'INTEGER', mode='NULLABLE')]))
        cursor = connection.cursor()
        cursor.execute('SELECT a, b, c FROM hello_world WHERE d > 3;')

        # Verify the description.
        self.assertEqual(len(cursor.description), 3)
        a_name, a_type, _, _, _, _, a_null_ok = cursor.description[0]
        self.assertEqual(a_name, 'a')
        self.assertEqual(a_type, 'STRING')
        self.assertEqual(a_type, dbapi.STRING)
        self.assertTrue(a_null_ok)
        b_name, b_type, _, _, _, _, b_null_ok = cursor.description[1]
        self.assertEqual(b_name, 'b')
        self.assertEqual(b_type, 'STRING')
        self.assertEqual(b_type, dbapi.STRING)
        self.assertFalse(b_null_ok)
        c_name, c_type, _, _, _, _, c_null_ok = cursor.description[2]
        self.assertEqual(c_name, 'c')
        self.assertEqual(c_type, 'INTEGER')
        self.assertEqual(c_type, dbapi.NUMBER)
        self.assertTrue(c_null_ok)

        # Verify the results.
        self.assertEqual(cursor.rowcount, 2)
        row = cursor.fetchone()
        self.assertEqual(row, ('hello', 'world', 1))
        row = cursor.fetchone()
        self.assertEqual(row, ('howdy', 'y\'all', 2))
        row = cursor.fetchone()
        self.assertIsNone(row)
Example #4
def test_to_arrow_w_tqdm_wo_query_plan():
    from google.cloud.bigquery import table
    from google.cloud.bigquery.job import QueryJob as target_class
    from google.cloud.bigquery.schema import SchemaField

    begun_resource = _make_job_resource(job_type="query")
    rows = [
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
    ]

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    connection = _make_connection({})
    client = _make_client(connection=connection)
    job = target_class.from_api_repr(begun_resource, client)

    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = table.RowIterator(client, api_request, path, schema)

    reload_patch = mock.patch(
        "google.cloud.bigquery.job._AsyncJob.reload", autospec=True
    )
    result_patch = mock.patch(
        "google.cloud.bigquery.job.QueryJob.result",
        side_effect=[concurrent.futures.TimeoutError, row_iterator],
    )

    with result_patch as result_patch_tqdm, reload_patch:
        tbl = job.to_arrow(progress_bar_type="tqdm", create_bqstorage_client=False)

    assert result_patch_tqdm.call_count == 2
    assert isinstance(tbl, pyarrow.Table)
    assert tbl.num_rows == 2
    result_patch_tqdm.assert_called()
Example #5
    def _get_schema_for_field(self, column: str) -> SchemaField:
        field_type = 'STRING'
        field_mode = 'REQUIRED'

        concrete_field_type = self._get_type_for_field(column)

        if concrete_field_type == date:
            field_type = 'DATE'
        elif concrete_field_type == int:
            field_type = 'INT64'
        elif concrete_field_type == float:
            field_type = 'FLOAT64'

        return SchemaField(column, field_type, field_mode)
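
For reference, the dispatch above boils down to a small type-to-name mapping. A minimal standalone sketch (schema_field_for is a hypothetical helper; the method above obtains the concrete type from self._get_type_for_field):

from datetime import date

from google.cloud.bigquery.schema import SchemaField


def schema_field_for(column, concrete_type):
    # Map a Python type to a BigQuery type name; anything else becomes STRING.
    mapping = {date: 'DATE', int: 'INT64', float: 'FLOAT64'}
    return SchemaField(column, mapping.get(concrete_type, 'STRING'), 'REQUIRED')


assert schema_field_for('created_on', date).field_type == 'DATE'
assert schema_field_for('note', str).field_type == 'STRING'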
Example #6
    def test_w_description(self):
        from google.cloud.bigquery.schema import SchemaField

        DESCRIPTION = 'DESCRIPTION'
        full_name = SchemaField('full_name',
                                'STRING',
                                mode='REQUIRED',
                                description=DESCRIPTION)
        age = SchemaField('age', 'INTEGER', mode='REQUIRED')
        resource = self._call_fut([full_name, age])
        self.assertEqual(len(resource), 2)
        self.assertEqual(
            resource[0], {
                'name': 'full_name',
                'type': 'STRING',
                'mode': 'REQUIRED',
                'description': DESCRIPTION
            })
        self.assertEqual(resource[1], {
            'name': 'age',
            'type': 'INTEGER',
            'mode': 'REQUIRED'
        })
Example #7
    def test_w_description(self):
        from google.cloud.bigquery.schema import SchemaField

        DESCRIPTION = "DESCRIPTION"
        full_name = SchemaField(
            "full_name", "STRING", mode="REQUIRED", description=DESCRIPTION
        )
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        resource = self._call_fut([full_name, age])
        self.assertEqual(len(resource), 2)
        self.assertEqual(
            resource[0],
            {
                "name": "full_name",
                "type": "STRING",
                "mode": "REQUIRED",
                "description": DESCRIPTION,
            },
        )
        self.assertEqual(
            resource[1],
            {"name": "age", "type": "INTEGER", "mode": "REQUIRED", "description": None},
        )
Example #8
def test_query_write_mode(bqtk: BQTestKit):
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        source_schema = [SchemaField("f1", field_type="STRING")]
        target_schema = [
            SchemaField("f1", field_type="STRING"),
            SchemaField("f2", field_type="INT64")
        ]
        with ds.table("table_bar", schema=source_schema).isolate() as t_source:
            with ds.table("table_foobar",
                          schema=target_schema).isolate() as t_target:
                bqtk.query_template(from_=f"select count(*) as f2, 'test' as f1 from `{t_source.fqdn()}`") \
                    .with_destination(t_target).run()
                result = bqtk.query_template(
                    from_=f"select * from `{t_target.fqdn()}` order by f2"
                ).run()
                assert result.rows == [{"f1": "test", "f2": 0}]
                bqtk.query_template(from_="select 'test2' as f1, 2 as f2") \
                    .with_destination(t_target).append().run()
                result = bqtk.query_template(
                    from_=f"select * from `{t_target.fqdn()}` order by f2"
                ).run()
                assert result.rows == [{
                    "f1": "test",
                    "f2": 0
                }, {
                    "f1": "test2",
                    "f2": 2
                }]
                bqtk.query_template(from_="select 'test3' as f1, 3 as f2") \
                    .with_destination(t_target).overwrite().run()
                result = bqtk.query_template(
                    from_=f"select * from `{t_target.fqdn()}` order by f2"
                ).run()
                assert result.rows == [{"f1": "test3", "f2": 3}]
                with pytest.raises(Exception):
                    bqtk.query_template(from_="select 'test4' as f1, 4 as f2") \
                        .with_destination(t_target).error_if_exists().run()
Example #9
    def test_insert_rows(self):
        # type: () -> None
        dataset_ref = DatasetReference('my_project', 'my_dataset')
        dataset = Dataset(dataset_ref)
        table1_ref = TableReference(dataset_ref, 'table1')
        schema = [SchemaField(name="a", field_type='INT64'),
                  SchemaField(name="b", field_type='FLOAT64'),
                  ]
        table = Table(table1_ref, schema)
        self.bq_client.create_dataset(dataset)
        self.bq_client.create_table(table)

        # Insert two rows, check that they landed
        self.assertFalse(self.bq_client.insert_rows(table, [{'a': 1, 'b': 2.5},
                                                            # Intentionally omit 'b' here.
                                                            {'a': 3}]))
        self.assertRowsExpected(
                self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                                     QueryJobConfig()),
                [[1, 2.5],
                 [3, None]])

        self.assertRowsExpected(
                self.bq_client.query('SELECT a FROM `my_project.my_dataset.table1` WHERE b is NULL',
                                     QueryJobConfig()),
                [[3]])

        # Insert two more rows, check that all four rows are now present.
        self.assertFalse(self.bq_client.insert_rows(table, [{'a': 5, 'b': 6.5},
                                                            {'a': 7, 'b': 8.25}]))
        self.assertRowsExpected(
                self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                                     QueryJobConfig()),
                [[1, 2.5],
                 [3, None],
                 [5, 6.5],
                 [7, 8.25]])
Example #10
    def create_mock_tables(cls):
        # type: () -> None
        """Create mock tables"""

        super(MockBQTest, cls).create_mock_tables()

        cls.dates_table_name = cls.client.path('dates',
                                               delimiter=BQ_PATH_DELIMITER)
        cls.client.populate_table(
            cls.dates_table_name,
            [
                SchemaField('foo', 'DATETIME'),
                SchemaField('bar', 'INTEGER'),
                SchemaField('baz', 'INTEGER')
            ],
            [['1987-05-13 00:00:00', 2, 3], ['1950-01-01 00:00:00', 5, 6]],
        )

        cls.str_table_name = cls.client.path('strings',
                                             delimiter=BQ_PATH_DELIMITER)
        cls.client.populate_table(
            cls.str_table_name,
            [SchemaField('char1', 'STRING')],
            [['123'], ['456']],
        )

        cls.str_with_single_quotes_table_name = cls.client.path(
            'strings2', delimiter=BQ_PATH_DELIMITER)
        cls.client.populate_table(
            cls.str_with_single_quotes_table_name,
            [
                SchemaField('description', 'STRING'),
                SchemaField('is_good', 'BOOLEAN')
            ],
            [['Description of something with \'single quotes\'', True]],
        )

        cls.bool_table_name = cls.client.path('booleans',
                                              delimiter=BQ_PATH_DELIMITER)
        cls.client.populate_table(
            cls.bool_table_name,
            [
                SchemaField('str_col', 'STRING'),
                SchemaField('bool_col', 'BOOLEAN')
            ],
            [['yes', True], ['no', False], ['yes2', True]],
        )
Example #11
    def to_schema_field(self, name):
        # type: (str) -> SchemaField
        """Converts this type to a BigQuery SchemaField.

        Args:
            name: The name of the column.  This class represents a type; SchemaField represents
            a column, so it includes the type and also the name of the column.

        Returns:
            A SchemaField object corresponding to a column containing this class' type.
        """
        if isinstance(self.type_, BQScalarType):
            return SchemaField(name=name, field_type=self.type_.value, mode='REPEATED')
        raise NotImplementedError("SchemaField for ARRAY of {} not implemented"
                                  .format(self.type_))
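
A quick illustration of the array case (assuming BQArray and BQScalarType are importable from purplequery.bq_types, as the imports in Example #30 suggest):

from purplequery.bq_types import BQArray, BQScalarType

# An ARRAY<INT64> column comes back as a REPEATED scalar field.
field = BQArray(BQScalarType.INTEGER).to_schema_field('xs')
assert (field.name, field.field_type, field.mode) == ('xs', 'INTEGER', 'REPEATED')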
Example #12
def test_change_partition_by_range(bqtk: BQTestKit):
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        table_bar_schema = [SchemaField("my_int", "int64")]
        with ds.table("table_bar"). \
             with_schema(from_=table_bar_schema). \
             partition_by(Range(on_field="my_int",
                                start=0,
                                end=10000,
                                interval=10)). \
             isolate() as t:
            show_res = t.show()
            assert show_res is not None
            assert show_res.time_partitioning is None
            expected_range = RangePartitioning(field='my_int', range_=PartitionRange(end=10000, interval=10, start=0))
            assert show_res.range_partitioning == expected_range
Example #13
    def setUp(self):
        self.bq_client = Client('my_project')
        dataset_ref = DatasetReference('my_project', 'my_dataset')
        schema = [
            SchemaField(name="a", field_type='INT64'),
            SchemaField(name="b", field_type='FLOAT64'),
        ]
        self.source_table = Table(TableReference(dataset_ref, 'source_table'),
                                  schema)
        self.destination_table = Table(
            TableReference(dataset_ref, 'destination_table'), schema)
        self.bq_client.create_dataset(Dataset(dataset_ref))
        self.bq_client.create_table(self.source_table)
        # We don't create the destination table here; some tests do not want it created.

        # Stick two rows into source_table
        self.assertFalse(
            self.bq_client.insert_rows(self.source_table, [{
                'a': 1,
                'b': 2.5
            }, {
                'a': 3,
                'b': 4.25
            }]))
Example #14
    def create_mock_tables(cls):
        # type: () -> None
        """Create mock tables"""
        cls.src_table_name = cls.client.path('tmp',
                                             delimiter=BQ_PATH_DELIMITER)
        cls.client.populate_table(
            cls.src_table_name,
            FOO_BAR_BAZ_INTEGERS_SCHEMA,
            [[1, 2, 3], [4, 5, 6]],
        )

        cls.long_table_name = cls.client.path('long_table',
                                              delimiter=BQ_PATH_DELIMITER)
        cls.client.populate_table(cls.long_table_name, [
            SchemaField('foo', 'INTEGER'),
        ], [[1]] * LONG_TABLE_LENGTH)
Example #15
    def test_convert_between_schema_field_and_bq_type(self, bq_type,
                                                      schema_field):
        # type: (BQScalarType, SchemaField) -> None

        # Test scalar
        self.assertEqual(BQType.from_schema_field(schema_field), bq_type)
        self.assertEqual(bq_type.to_schema_field('foo'), schema_field)

        # Test array
        schema_array_field = SchemaField(name=schema_field.name,
                                         field_type=schema_field.field_type,
                                         mode='REPEATED')
        bq_array_type = BQArray(bq_type)
        self.assertEqual(BQType.from_schema_field(schema_array_field),
                         bq_array_type)
        self.assertEqual(bq_array_type.to_schema_field('foo'),
                         schema_array_field)
Example #16
    def _get_columns_helper(self, columns, cur_columns):
        """
        Recurse into record type and return all the nested field names.
        As contributed by @sumedhsakdeo on issue #17
        """
        results = []
        for col in columns:
            results += [SchemaField(name='.'.join(c.name for c in cur_columns + [col]),
                                    field_type=col.field_type,
                                    mode=col.mode,
                                    description=col.description,
                                    fields=col.fields)]
            if col.field_type == 'RECORD':
                cur_columns.append(col)
                results += self._get_columns_helper(col.fields, cur_columns)
                cur_columns.pop()
        return results
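
To see what the recursion yields, here is a minimal standalone sketch of the same flattening (a free function instead of a method; SchemaField is the real google.cloud.bigquery type):

from google.cloud.bigquery.schema import SchemaField


def flatten_column_names(columns, prefix=()):
    # Collect dotted names, descending into RECORD fields.
    names = []
    for col in columns:
        path = list(prefix) + [col]
        names.append('.'.join(c.name for c in path))
        if col.field_type == 'RECORD':
            names += flatten_column_names(col.fields, path)
    return names


address = SchemaField('address', 'RECORD', fields=(
    SchemaField('city', 'STRING'),
    SchemaField('zip', 'STRING'),
))
print(flatten_column_names([SchemaField('name', 'STRING'), address]))
# ['name', 'address', 'address.city', 'address.zip']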
Example #17
    def __to_bq_schema(schema_df: pd.DataFrame):
        type_mapping = {
            'i': 'INTEGER',
            'b': 'BOOLEAN',
            'f': 'FLOAT',
            'O': 'STRING',
            'S': 'STRING',
            'U': 'STRING',
            'M': 'TIMESTAMP'
        }

        fields = []
        for column_name, dtype in schema_df.dtypes.items():
            fields.append(
                SchemaField(column_name,
                            type_mapping.get(dtype.kind, 'STRING')))

        return fields
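
The dtype.kind codes above are NumPy's one-character kind codes: 'i' signed integer, 'b' boolean, 'f' float, 'O' Python object, 'S'/'U' byte/unicode string, 'M' datetime. A small usage sketch with a hypothetical DataFrame:

import pandas as pd

df = pd.DataFrame({'id': [1], 'score': [0.5], 'name': ['a'], 'active': [True]})
print({col: dtype.kind for col, dtype in df.dtypes.items()})
# {'id': 'i', 'score': 'f', 'name': 'O', 'active': 'b'}
# -> INTEGER, FLOAT, STRING, BOOLEAN under the mapping above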
Example #18
    def create_test_table(cls, table_name, schema_file, data_file=None, table_postfix=''):
        # type: (str, str, Optional[str], str) -> None
        """
        This method creates a table to be used in testing

        Args:
            table_name: A string with the name of the table
            schema_file: A string with the name of the schema file from which to build the test
                table
            data_file: A string with the name of the file containing the data to load. If
                None, create an empty table.
            table_postfix: A string to be added to the end of the table_name.
        """
        data = cls._load_csv_with_schema(schema_file, data_file) if data_file else []
        table_path = cls.client.path(table_name + table_postfix)
        with open(schema_file) as f:
            schema_json = json.load(f)
        schema_list = [SchemaField(row['name'], row['type']) for row in schema_json]
        cls.client.populate_table(table_path, schema_list, data)
Example #19
    def get_schema(self, dataset_id, table_name, project_id=None):
        # type: (str, str, Optional[str]) -> List[SchemaField]
        """Returns the schema of a table. Note that due to the imperfect mapping
        of SQLiteTypes to BQ types, these schemas won't be perfect. Anything relying heavily
        on correct schemas should use the real BigQuery client.

        Args:
            dataset_id: The dataset to query.
            table_name: The name of the table.
            project_id: The project ID of the table.
        Returns:
            A list of SchemaFields representing the schema.
        """
        # schema rows are in the format (cid, name, type, notnull, dflt_value, pk)
        standardized_path = self.path(table_name,
                                      dataset_id,
                                      project_id,
                                      delimiter=MOCK_DELIMITER)
        # 'pragma' is SQLite's equivalent to DESCRIBE TABLE
        pragma_query = 'pragma table_info(\'' + standardized_path + '\')'
        single_row_query = 'SELECT * FROM ' + standardized_path + ' LIMIT 1'

        single_row = self.conn.execute(single_row_query).fetchall()
        schema = self.conn.execute(pragma_query).fetchall()

        returned_schema = []
        for i in range(len(schema)):
            row_name = schema[i][1]
            if len(single_row) > 0:
                row_type = self._db_type_to_bq_type(schema[i][2],
                                                    sample=single_row[0][i])
            else:
                row_type = self._db_type_to_bq_type(schema[i][2])
            # Repeated fields are not supported in mock BigQuery so we always set the mode
            # to nullable.
            returned_schema.append(
                SchemaField(row_name, row_type, mode='NULLABLE'))
        return returned_schema
Example #20
def _parse_schema_resource(info):
    """Parse a resource fragment into a schema field.

    :type info: mapping
    :param info: should contain a "fields" key to be parsed

    :rtype: list of :class:`SchemaField`
    :returns: a list of parsed fields, or an empty tuple if no "fields" key
                is present in ``info``.
    """
    if 'fields' not in info:
        return ()

    schema = []
    for r_field in info['fields']:
        name = r_field['name']
        field_type = r_field['type']
        mode = r_field.get('mode', 'NULLABLE')
        description = r_field.get('description')
        sub_fields = _parse_schema_resource(r_field)
        schema.append(
            SchemaField(name, field_type, mode, description, sub_fields))
    return schema
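
A quick check of the parser against a hand-written resource fragment (the field names here are illustrative):

info = {'fields': [
    {'name': 'full_name', 'type': 'STRING', 'mode': 'REQUIRED'},
    {'name': 'address', 'type': 'RECORD', 'fields': [
        {'name': 'city', 'type': 'STRING'},
    ]},
]}
for field in _parse_schema_resource(info):
    print(field.name, field.field_type, field.mode, len(field.fields))
# full_name STRING REQUIRED 0
# address RECORD NULLABLE 1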
Example #21
    def _get_schema_for_field(column: str, methods: Sequence[dict]):
        field_type = 'STRING'
        field_mode = 'REQUIRED'

        method = next((method['method']
                       for method in methods if method['fieldName'] == column),
                      None)

        if method == SistrixApiClient.ENDPOINT_DOMAIN_VISIBILITYINDEX:
            field_type = SqlTypeNames.FLOAT
            field_mode = 'NULLABLE'
        elif method in (SistrixApiClient.ENDPOINT_DOMAIN_PAGES,
                        SistrixApiClient.ENDPOINT_DOMAIN_KEYWORDCOUNT_SEO,
                        SistrixApiClient.ENDPOINT_DOMAIN_KEYWORDCOUNT_SEO_TOP10):
            field_type = SqlTypeNames.INTEGER
            field_mode = 'NULLABLE'

        if column == 'date':
            field_type = SqlTypeNames.DATE

        if column in ('daily', 'mobile'):
            field_type = SqlTypeNames.BOOLEAN

        return SchemaField(column, field_type, field_mode)
Example #22
    def test_table_exists_with_name(self):
        dataset_that_exists = 'dataset_{}'.format(
            self.make_n_digit_random_number(6))
        table_that_exists = 'table_{}'.format(
            self.make_n_digit_random_number(6))
        table_path_that_exists = self.client.path(table_that_exists,
                                                  dataset_that_exists)
        self.client.create_dataset_by_name(dataset_that_exists)
        self.addCleanup(lambda: self.client.delete_dataset_by_name(
            dataset_that_exists, delete_all_tables=True))
        self.client.create_tables_from_dict(
            {table_that_exists: [SchemaField('col', 'INT64')]},
            dataset_that_exists)

        self.assertTrue(
            self.client.table_exists_with_name(table_path_that_exists))
        self.assertFalse(
            self.client.table_exists_with_name(
                self.client.path(table_that_exists,
                                 'dataset_that_does_not_exist')))
        self.assertFalse(
            self.client.table_exists_with_name(
                self.client.path('table_that_does_not_exist',
                                 dataset_that_exists)))
Example #23
    def _prepare_schema(self):
        return [SchemaField(**row) for row in SCHEMA]
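
A minimal sketch of the SCHEMA constant this expects: a sequence of dicts whose keys match SchemaField's constructor parameters (name and field_type, plus optionally mode, description, and fields):

SCHEMA = [
    {'name': 'id', 'field_type': 'INTEGER', 'mode': 'REQUIRED'},
    {'name': 'name', 'field_type': 'STRING'},
]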
Example #24
def get_schema_field(item_definition: dict) -> SchemaField:
    schema_field = SchemaField(item_definition['dbName'],
                               item_definition['type'])
    return schema_field
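
Usage, assuming item definitions shaped like the hypothetical dict below (dbName carries the column name):

field = get_schema_field({'dbName': 'user_id', 'type': 'INTEGER'})
assert (field.name, field.field_type) == ('user_id', 'INTEGER')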
Example #25
    def test_create_tables_from_dict_overwrite(self):
        # type: () -> None
        # Create the dataset once.
        self.client.create_tables_from_dict(
            {
                'empty_1': [
                    SchemaField('col1', 'INTEGER'),
                    SchemaField('col2', 'STRING')
                ],
                'empty_2': [
                    SchemaField('col1', 'FLOAT'),
                    SchemaField('col2', 'INTEGER')
                ]
            },
            replace_existing_tables=True)

        # Create it again with a different schema. Make sure the changes take, since it should
        # have recreated the tables.
        self.client.create_tables_from_dict(
            {
                'empty_1': [
                    SchemaField('col1_test1', 'INTEGER'),
                    SchemaField('col2_test2', 'STRING')
                ],
                'empty_2': [
                    SchemaField('col1_test1', 'FLOAT'),
                    SchemaField('col2_test2', 'INTEGER')
                ]
            },
            replace_existing_tables=True)
        self.assertEqual([('col1_test1', 'INTEGER', 'NULLABLE'),
                          ('col2_test2', 'STRING', 'NULLABLE')],
                         [(x.name, x.field_type, x.mode)
                          for x in self.client.get_schema(
                              self.default_test_dataset_id, 'empty_1')])
        self.assertEqual([('col1_test1', 'FLOAT', 'NULLABLE'),
                          ('col2_test2', 'INTEGER', 'NULLABLE')],
                         [(x.name, x.field_type, x.mode)
                          for x in self.client.get_schema(
                              self.default_test_dataset_id, 'empty_2')])

        # Try to create one of the tables again; it should raise a RuntimeError.
        with self.assertRaises(RuntimeError):
            self.client.create_tables_from_dict(
                {
                    'empty_1': [
                        SchemaField('col1', 'INTEGER'),
                        SchemaField('col2', 'STRING')
                    ],
                },
                replace_existing_tables=False)

        # Try to create a table that does not yet exist in the dataset. It should work fine.
        self.client.create_tables_from_dict(
            {
                'empty_3': [
                    SchemaField('col1', 'INTEGER'),
                    SchemaField('col2', 'STRING')
                ],
            },
            replace_existing_tables=False)
        self.assertEqual([('col1', 'INTEGER', 'NULLABLE'),
                          ('col2', 'STRING', 'NULLABLE')],
                         [(x.name, x.field_type, x.mode)
                          for x in self.client.get_schema(
                              self.default_test_dataset_id, 'empty_3')])
Example #26
class BqTypesTest(unittest.TestCase):
    @data(
        (BQScalarType.BOOLEAN, SchemaField(name='foo', field_type='BOOLEAN')),
        (BQScalarType.DATE, SchemaField(name='foo', field_type='DATE')),
        (BQScalarType.DATETIME, SchemaField(name='foo',
                                            field_type='DATETIME')),
        (BQScalarType.INTEGER, SchemaField(name='foo', field_type='INTEGER')),
        (BQScalarType.FLOAT, SchemaField(name='foo', field_type='FLOAT')),
        (BQScalarType.STRING, SchemaField(name='foo', field_type='STRING')),
        (BQScalarType.TIMESTAMP, SchemaField(name='foo',
                                             field_type='TIMESTAMP')),
    )
    @unpack
    def test_convert_between_schema_field_and_bq_type(self, bq_type,
                                                      schema_field):
        # type: (BQScalarType, SchemaField) -> None

        # Test scalar
        self.assertEqual(BQType.from_schema_field(schema_field), bq_type)
        self.assertEqual(bq_type.to_schema_field('foo'), schema_field)

        # Test array
        schema_array_field = SchemaField(name=schema_field.name,
                                         field_type=schema_field.field_type,
                                         mode='REPEATED')
        bq_array_type = BQArray(bq_type)
        self.assertEqual(BQType.from_schema_field(schema_array_field),
                         bq_array_type)
        self.assertEqual(bq_array_type.to_schema_field('foo'),
                         schema_array_field)

    @data(
        (BQScalarType.BOOLEAN, SchemaField(name='foo', field_type='BOOL')),
        (BQScalarType.INTEGER, SchemaField(name='foo', field_type='INT64')),
        (BQScalarType.FLOAT, SchemaField(name='foo', field_type='FLOAT64')),
    )
    @unpack
    def test_convert_from_standard_schema_field_to_bq_type(
            self, bq_type, schema_field):
        # type: (BQScalarType, SchemaField) -> None

        self.assertEqual(BQType.from_schema_field(schema_field), bq_type)

    @data(
        (BQScalarType.BOOLEAN, 'BOOL'),
        (BQScalarType.INTEGER, 'int64'),
        (BQScalarType.FLOAT, 'FLOAT64'),
        (BQScalarType.BOOLEAN, 'boolean'),
        (BQScalarType.DATE, 'DATE'),
        (BQScalarType.DATETIME, 'datetime'),
        (BQScalarType.INTEGER, 'INTEGER'),
        (BQScalarType.FLOAT, 'FlOaT'),
        (BQScalarType.STRING, 'STRING'),
        (BQScalarType.TIMESTAMP, 'timeSTAMP'),
    )
    @unpack
    def test_convert_from_string_to_bq_scalar_type(self, bq_type, string):
        # type: (BQScalarType, str) -> None
        self.assertEqual(BQScalarType.from_string(string), bq_type)
        self.assertEqual(BQScalarType.from_string(string.lower()), bq_type)

    @data(
        (BQScalarType.BOOLEAN, ),
        (BQScalarType.DATE, ),
        (BQScalarType.DATETIME, ),
        (BQScalarType.INTEGER, ),
        (BQScalarType.FLOAT, ),
        (BQScalarType.STRING, ),
        (BQScalarType.TIMESTAMP, ),
    )
    @unpack
    def test_two_arrays_of_same_type_are_same_object(self, bq_type):
        # type: (BQScalarType) -> None
        # Type objects are immutable, and we need to be able to compare them
        # (an array of ints is an array of ints, but it's not a string or an array of floats).
        # A way to achieve this is to ensure that all types, including arrays, are singletons.
        # So we test that for each scalar type, creating two arrays of it yields the same object.
        a1 = BQArray(bq_type)
        a2 = BQArray(bq_type)
        self.assertIs(a1, a2)

    @data((BQScalarType.BOOLEAN, np.bool_(True), True),
          (BQScalarType.DATE, np.datetime64('2019-01-07'),
           datetime.date(2019, 1, 7)),
          (BQScalarType.DATETIME, np.datetime64('2019-01-07T10:32:05.123456'),
           datetime.datetime(2019, 1, 7, 10, 32, 5, 123456)),
          (BQScalarType.INTEGER, np.float64(35.0), 35),
          (BQScalarType.FLOAT, np.float64(12.34), 12.34),
          (BQScalarType.STRING, np.string_('hello'), 'hello'),
          (BQScalarType.TIMESTAMP, np.datetime64('2019-01-07T10:32:05.123456'),
           datetime.datetime(2019, 1, 7, 10, 32, 5, 123456)))
    @unpack
    def test_convert(self, bq_type, np_object, py_object):
        # type: (BQScalarType, NumPyType, PythonType) -> None

        # First, convert from a NumPy-typed object to a Pandas-typed object.
        # Types are mostly the same except for np.datetime64 becomes pd.Timestamp
        # We do this by creating a Pandas Series containing the single object, and then
        # converting it to a sequence and extracting its single element.
        pd_object, = pd.Series(np_object)
        self.assertEqual(bq_type.convert(pd_object), py_object)

        # Test that for any type, a NaN converts to None
        self.assertIsNone(bq_type.convert(np.nan))

        # Now test the same conversion for a list (array) of objects.
        # Construct a Series containing a single row which is a list of three objects.
        pd_array_object, = pd.Series([(pd_object, ) * 3])
        self.assertEqual(
            BQArray(bq_type).convert(pd_array_object), (py_object, ) * 3)

        # Test that for any Array type, a NaN converts to None
        self.assertIsNone(BQArray(bq_type).convert(np.nan))

    @data(
        (BQScalarType.BOOLEAN, np.bool_),
        (BQScalarType.DATE, 'datetime64[ns]'),
        (BQScalarType.DATETIME, 'datetime64[ns]'),
        (BQScalarType.INTEGER, np.float64),
        (BQScalarType.FLOAT, np.float64),
        (BQScalarType.STRING, np.string_),
        (BQScalarType.TIMESTAMP, 'datetime64[ns]'),
    )
    @unpack
    def test_to_dtype(self, bq_type, np_type):
        # type: (BQScalarType, NumPyType) -> None
        self.assertEqual(bq_type.to_dtype(), np.dtype(np_type))
        # NumPy doesn't know from cell elements that are lists, so it just leaves it as an
        # uninterpreted Python object.
        self.assertEqual(BQArray(bq_type).to_dtype(), np.dtype('object'))

    def test_get_typed_series_as_list(self):
        typed_series = TypedSeries(
            pd.Series([(np.float64(1.5), np.float64(2.5), np.float64(3.0)),
                       (np.float64(2.5), np.float64(3.5), np.float64(4.0))]),
            BQArray(BQScalarType.FLOAT))
        self.assertEqual(typed_series.to_list(), [(1.5, 2.5, 3.0),
                                                  (2.5, 3.5, 4.0)])

    def test_get_typed_dataframe_schema(self):
        typed_dataframe = TypedDataFrame(
            pd.DataFrame(columns=['a', 'b']),
            [BQScalarType.BOOLEAN,
             BQArray(BQScalarType.FLOAT)])
        self.assertEqual(typed_dataframe.to_bq_schema(), [
            SchemaField(name='a', field_type='BOOLEAN'),
            SchemaField(name='b', field_type='FLOAT', mode='REPEATED')
        ])

    def test_get_typed_dataframe_as_list_of_lists(self):
        typed_dataframe = TypedDataFrame(
            pd.DataFrame(
                [[
                    np.bool_(True),
                    (np.float64(1.5), np.float64(2.5), np.float64(3.0))
                ],
                 [
                     np.bool_(False),
                     (np.float64(2.5), np.float64(3.5), np.float64(4.0))
                 ]],
                columns=['a', 'b']),
            [BQScalarType.BOOLEAN,
             BQArray(BQScalarType.FLOAT)])
        self.assertEqual(typed_dataframe.to_list_of_lists(),
                         [[True, (1.5, 2.5, 3.0)], [False, (2.5, 3.5, 4.0)]])

    @data(
        dict(names=[], expected_name=None),
        dict(names=['foo'], expected_name='foo'),
        dict(names=['foo', None, 'foo'], expected_name='foo'),
    )
    @unpack
    def test_coerce_names(self, names, expected_name):
        # type: (Sequence[str], Optional[str]) -> None
        self.assertEqual(expected_name, _coerce_names(names))

    @data(
        dict(names=['foo', 'bar']), )
    @unpack
    def test_coerce_names_error(self, names):
        # type: (Sequence[str]) -> None
        with self.assertRaisesRegex(ValueError,
                                    'field names .* do not match'):
            _coerce_names(names)

    @data(
        ([BQScalarType.INTEGER], BQScalarType.INTEGER),
        ([None, BQScalarType.INTEGER], BQScalarType.INTEGER),
        ([BQScalarType.FLOAT], BQScalarType.FLOAT),
        ([BQScalarType.FLOAT, None], BQScalarType.FLOAT),
        ([BQScalarType.FLOAT, BQScalarType.FLOAT], BQScalarType.FLOAT),
        ([BQScalarType.STRING, BQScalarType.STRING], BQScalarType.STRING),
        ([BQScalarType.STRING, None, BQScalarType.STRING
          ], BQScalarType.STRING),
        ([BQScalarType.INTEGER, BQScalarType.FLOAT], BQScalarType.FLOAT),
        ([BQScalarType.STRING, BQScalarType.DATE], BQScalarType.DATE),
        ([BQScalarType.STRING, BQScalarType.TIMESTAMP
          ], BQScalarType.TIMESTAMP),
    )
    @unpack
    def test_implicitly_coerce(self, input_types, expected_supertype):
        # type: (List[BQScalarType], BQScalarType) -> None
        supertype = implicitly_coerce(*input_types)
        self.assertEqual(supertype, expected_supertype)

    @data(
        ([], "No types provided to merge"),
        ([BQScalarType.STRING, BQScalarType.INTEGER
          ], "Cannot implicitly coerce the given types:"),
        ([BQScalarType.STRING, BQScalarType.DATE, BQScalarType.TIMESTAMP
          ], "Cannot implicitly coerce the given types:"),
    )
    @unpack
    def test_implicitly_coerce_error(self, input_types, error):
        # type: (List[BQScalarType], str) -> None
        with self.assertRaisesRegex(ValueError, error):
            implicitly_coerce(*input_types)
Example #27
    def test_begin_w_alternate_client(self):
        from google.cloud.bigquery.job import CreateDisposition
        from google.cloud.bigquery.job import LoadJobConfig
        from google.cloud.bigquery.job import SchemaUpdateOption
        from google.cloud.bigquery.job import WriteDisposition
        from google.cloud.bigquery.schema import SchemaField

        PATH = "/projects/%s/jobs" % (self.PROJECT, )
        RESOURCE = self._make_resource(ended=True)
        LOAD_CONFIGURATION = {
            "sourceUris": [self.SOURCE1],
            "destinationTable": {
                "projectId": self.PROJECT,
                "datasetId": self.DS_ID,
                "tableId": self.TABLE_ID,
            },
            "allowJaggedRows": True,
            "allowQuotedNewlines": True,
            "createDisposition": CreateDisposition.CREATE_NEVER,
            "encoding": "ISO-8559-1",
            "fieldDelimiter": "|",
            "ignoreUnknownValues": True,
            "maxBadRecords": 100,
            "nullMarker": r"\N",
            "quote": "'",
            "skipLeadingRows": "1",
            "sourceFormat": "CSV",
            "useAvroLogicalTypes": True,
            "writeDisposition": WriteDisposition.WRITE_TRUNCATE,
            "schema": {
                "fields": [
                    {
                        "name": "full_name",
                        "type": "STRING",
                        "mode": "REQUIRED",
                        "description": None,
                    },
                    {
                        "name": "age",
                        "type": "INTEGER",
                        "mode": "REQUIRED",
                        "description": None,
                    },
                ]
            },
            "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
        }
        RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
        conn1 = _make_connection()
        client1 = _make_client(project=self.PROJECT, connection=conn1)
        conn2 = _make_connection(RESOURCE)
        client2 = _make_client(project=self.PROJECT, connection=conn2)
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        config = LoadJobConfig()
        config.schema = [full_name, age]
        job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF,
                             client1, config)
        config.allow_jagged_rows = True
        config.allow_quoted_newlines = True
        config.create_disposition = CreateDisposition.CREATE_NEVER
        config.encoding = "ISO-8559-1"
        config.field_delimiter = "|"
        config.ignore_unknown_values = True
        config.max_bad_records = 100
        config.null_marker = r"\N"
        config.quote_character = "'"
        config.skip_leading_rows = 1
        config.source_format = "CSV"
        config.use_avro_logical_types = True
        config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        config.schema_update_options = [
            SchemaUpdateOption.ALLOW_FIELD_ADDITION
        ]
        with mock.patch(
                "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
        ) as final_attributes:
            job._begin(client=client2)

        final_attributes.assert_called_with({"path": PATH}, client2, job)

        conn1.api_request.assert_not_called()
        self.assertEqual(len(conn2.api_request.call_args_list), 1)
        req = conn2.api_request.call_args_list[0]
        self.assertEqual(req[1]["method"], "POST")
        self.assertEqual(req[1]["path"], PATH)
        SENT = {
            "jobReference": {
                "projectId": self.PROJECT,
                "jobId": self.JOB_ID
            },
            "configuration": {
                "load": LOAD_CONFIGURATION
            },
        }
        self.maxDiff = None
        self.assertEqual(req[1]["data"], SENT)
        self._verifyResourceProperties(job, RESOURCE)
Example #28
def _make_field(field_type, mode="NULLABLE", name="testing", fields=()):
    from google.cloud.bigquery.schema import SchemaField

    return SchemaField(name=name, field_type=field_type, mode=mode, fields=fields)
Example #29
def get_rank_table_schema(src_schema):
    dst_schema_field = SchemaField('rank', 'INTEGER', mode='REQUIRED')
    dst_schema = [dst_schema_field] + src_schema
    return dst_schema
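
For example, prepending the rank column to a one-column source schema:

src_schema = [SchemaField('name', 'STRING')]
rank_schema = get_rank_table_schema(src_schema)
assert rank_schema[0] == SchemaField('rank', 'INTEGER', mode='REQUIRED')
assert rank_schema[1] == SchemaField('name', 'STRING')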
Example #30
import unittest
from typing import Any, List, Tuple  # noqa: F401

import six
from ddt import data, ddt, unpack
from google.api_core.exceptions import BadRequest, NotFound
from google.cloud.bigquery import Dataset, DatasetReference, Table, TableReference
from google.cloud.bigquery.job import QueryJobConfig
from google.cloud.bigquery.schema import SchemaField

from purplequery.bq_types import PythonType  # noqa: F401
from purplequery.client import _FakeJob  # noqa: F401
from purplequery.client import Client
from six.moves import cStringIO

_TEST_SCHEMA = [
    SchemaField(name="num", field_type='INTEGER'),
    SchemaField(name="ID", field_type='STRING'),
    SchemaField(name="height", field_type='FLOAT'),
    SchemaField(name="likes_chocolate", field_type='BOOLEAN'),
    SchemaField(name="start_date", field_type='DATE'),
    SchemaField(name="mid_date", field_type='DATETIME'),
    SchemaField(name="end_time", field_type='TIMESTAMP'),
    SchemaField(name="xs", field_type='INTEGER', mode='REPEATED'),
]


class ClientTestBase(unittest.TestCase):
    def assertRowsExpected(self, query_job, expected_rows):
        # type: (_FakeJob, List[List[PythonType]]) -> None
        """Assert that query_job has finished and it contains the expected rows.