Example #4
    def query(self,
              query,
              dialect='standard',
              priority='INTERACTIVE',
              strict=True,
              local_cache=True,
              **kwargs):
        """Run a query, tagging it with the calling user/notebook and
        optionally serving repeated queries from a local pickle cache."""

        # Guard against unfiltered full-table scans unless the caller opts out.
        if Bigquery._check_strict(query, strict):
            raise Exception(
                'Strict mode error',
                "partition reference not found in query; add a _partitiondate, "
                "_partitiontime or _table_suffix restriction to the where-clause, "
                "or set strict=False if you are confident in what you're doing.")

        querycomment = """/* Query launched from JupyterHub
            User: {user}
            Notebook: {notebook} */"""

        user = os.getenv('USER', 'pytt')

        notebook = os.path.abspath(os.path.curdir)
        commentedquery = "\n".join(
            [querycomment.format(user=user, notebook=notebook), query])

        if local_cache:
            # Cache key: md5 of the project id plus the raw query text.
            fn = self._cache_path + hashlib.md5(
                self.project_id.encode('utf-8') +
                query.encode('utf-8')).hexdigest() + '.tmp'

            if os.path.exists(fn):
                log.info('Query cached.')
                df = pd.read_pickle(fn)
            else:
                df = super().query(commentedquery,
                                   dialect=dialect,
                                   strict=strict,
                                   priority=priority)
                # Keep the original SQL next to the pickle for inspection.
                with open(os.path.splitext(fn)[0] + '.qry', 'w') as f:
                    f.write(query)

                df.to_pickle(fn)
            return df
        else:
            return super().query(commentedquery,
                                 dialect=dialect,
                                 strict=strict,
                                 priority=priority)
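A note on the example above: the local cache keys on the md5 of the project id plus the raw SQL, so an identical query is served from disk until its .tmp pickle is removed. A minimal usage sketch, assuming the BigqueryJupyter wrapper above and the _get_project_id()/_get_private_key_path() helpers from the tests (mydataset.mytable is a placeholder):

bq = BigqueryJupyter(_get_project_id(), _get_private_key_path())

sql = ("SELECT COUNT(*) AS num_rows FROM mydataset.mytable "
       "WHERE _PARTITIONTIME >= '2020-01-01'")

# First call hits BigQuery and pickles the result next to a .qry file.
df = bq.query(sql)

# An identical second call is served from the local pickle cache.
df_again = bq.query(sql)
assert df.equals(df_again)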
Example #5
class TestQueries(object):
    @classmethod
    def setup_class(cls):
        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        _setup_common()

    def setup_method(self, method):
        self.bigquery = Bigquery(_get_project_id(), _get_private_key_path())
        self.bigquery_jupyter = BigqueryJupyter(_get_project_id(),
                                                _get_private_key_path())
        self.dataset_prefix = _get_dataset_prefix_random()
        self.destination_table = "{0}.{1}".format(self.dataset_prefix,
                                                  TABLE_ID)

    @classmethod
    def teardown_class(cls):
        pass

    def teardown_method(self, method):
        pass

    def test_run_query_interactive(self):
        test_id = "1"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.bigquery.upload(df, self.destination_table + test_id)

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
            strict=False,
            priority='INTERACTIVE')
        assert result['num_rows'][0] == test_size

    def test_run_query_interactive_local_cache(self):
        test_id = "2"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.bigquery_jupyter.upload(df, self.destination_table + test_id)

        # Prime the local cache with the first count query.
        self.bigquery_jupyter.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
            strict=False)

        # Append a second copy; the table now holds 2 * test_size rows.
        self.bigquery_jupyter.upload(df,
                                     self.destination_table + test_id,
                                     if_exists='append')

        # The identical query is served from the local cache, so the
        # pre-append count is returned rather than the doubled one.
        result = self.bigquery_jupyter.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
            strict=False)

        assert result['num_rows'][0] == test_size

    def test_run_query_batch(self):
        test_id = "3"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.bigquery.upload(df, self.destination_table + test_id)

        result = self.bigquery.query_batch(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
            strict=False)
        assert result['num_rows'][0] == test_size

    def test_run_query_async(self):
        test_id = "4"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix, TABLE_ID + test_id,
                                   schema)

        # query_async returns a string handle (presumably the job id) rather
        # than a blocking dataframe result.
        result = self.bigquery.query_async(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
            strict=False)
        assert isinstance(result, str)

    def test_run_query_strict(self):
        test_id = "5"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix, TABLE_ID + test_id,
                                   schema)

        # strict defaults to True and neither query carries a partition
        # filter, so both calls must raise the strict-mode error.
        with pytest.raises(Exception):
            self.bigquery.query_async(
                "SELECT COUNT(*) as num_rows FROM {0}".format(
                    self.destination_table + test_id))

        with pytest.raises(Exception):
            self.bigquery.query("SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
                                priority='INTERACTIVE')

    def test_run_query_to_table(self):
        test_id = "6"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.bigquery.upload(df, self.destination_table + test_id)

        # Materialize the SELECT into a copy table by routing the batch
        # query's results to an explicit destinationTable.
        self.bigquery.query_batch(
            "SELECT * FROM {0}".format(self.destination_table + test_id),
            strict=False,
            configuration={
                'query': {
                    'allowLargeResults': True,
                    'writeDisposition': 'WRITE_TRUNCATE',
                    'priority': 'BATCH',
                    'destinationTable': {
                        'projectId': self.bigquery.project_id,
                        'datasetId': self.dataset_prefix,
                        'tableId': TABLE_ID + test_id + '_copy'
                    }
                }
            })

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '_copy'),
            strict=False,
            priority='INTERACTIVE')
        assert result['num_rows'][0] == test_size

    def test_run_query_batch_retry_wrong(self):
        test_id = "7"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.bigquery.upload(df, self.destination_table + test_id)

        # COUNT(***) is deliberately invalid SQL, so every attempt fails and
        # the GenericGBQException eventually propagates.
        with pytest.raises(GenericGBQException):
            Bigquery.run_with_retry(
                self.bigquery.query_batch,
                10,
                query="SELECT COUNT(***) as num_rows FROM {0}".format(
                    self.destination_table + test_id),
                strict=False)

    def test_run_query_batch_retry_no_error(self):
        test_id = "8"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)
        self.bigquery.upload(df, self.destination_table + test_id)

        result, attempts = Bigquery.run_with_retry(
            self.bigquery.query,
            query="SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
            strict=False,
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size and attempts == 1
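The two retry tests above pin down the contract of Bigquery.run_with_retry: it is called as run_with_retry(func, retries, **kwargs) and returns a (result, attempts) tuple. A minimal sketch of that contract, reusing the GenericGBQException the tests import; the exponential backoff is an assumption, not something the tests verify:

import time

def run_with_retry(func, retries=10, **kwargs):
    # Call func until it succeeds or retries are exhausted, returning the
    # result together with the number of attempts it took.
    for attempt in range(1, retries + 1):
        try:
            return func(**kwargs), attempt
        except GenericGBQException:
            if attempt == retries:
                raise  # retries exhausted: surface the last error
            time.sleep(2 ** attempt)  # assumed backoff, not verified above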
Example #6
class TestPartitionedTableOperations(object):
    @classmethod
    def setup_class(cls):
        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        _setup_common()

    def setup_method(self, method):
        self.bigquery = Bigquery(_get_project_id(), _get_private_key_path())
        self.dataset_prefix = _get_dataset_prefix_random()
        self.destination_table = "{0}.{1}".format(self.dataset_prefix,
                                                  TABLE_ID)

    @classmethod
    def teardown_class(cls):
        pass

    def teardown_method(self, method):
        pass

    def test_table_create(self):
        test_id = "1"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix,
                                   TABLE_ID + test_id,
                                   schema,
                                   body={'timePartitioning': {
                                       'type': 'DAY'
                                   }})

        actual = self.bigquery.resource(self.dataset_prefix,
                                        TABLE_ID + test_id)
        assert actual['timePartitioning']['type'] == 'DAY'

    def test_partition_delete(self):
        test_id = "2"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix,
                                   TABLE_ID + test_id,
                                   schema,
                                   body={'timePartitioning': {
                                       'type': 'DAY'
                                   }})

        self.bigquery.upload(df, self.destination_table + test_id + '_buffer')

        # Populate today's partition by querying the buffer table into
        # "table$YYYYMMDD" with WRITE_TRUNCATE.
        self.bigquery.query(
            f"select * from {self.destination_table}{test_id}_buffer",
            configuration={
                'query': {
                    'destinationTable': {
                        'projectId':
                        _get_project_id(),
                        'datasetId':
                        self.dataset_prefix,
                        'tableId':
                        TABLE_ID + test_id + '$' +
                        datetime.today().strftime("%Y%m%d")
                    },
                    'createDisposition': 'CREATE_IF_NEEDED',
                    'writeDisposition': 'WRITE_TRUNCATE',
                    'allowLargeResults': True,
                    'timePartitioning': {
                        'type': 'DAY'
                    }
                }
            },
            strict=False)

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' +
                datetime.today().strftime("%Y%m%d")),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size

        # Deleting "table$YYYYMMDD" drops only that day's partition; the
        # table itself survives, so the follow-up count returns zero.
        self.bigquery.table_delete(
            self.dataset_prefix,
            TABLE_ID + test_id + '$' + datetime.today().strftime("%Y%m%d"))

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' +
                datetime.today().strftime("%Y%m%d")),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == 0

    def test_upload_data_to_partition(self):
        test_id = "3"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix,
                                   TABLE_ID + test_id,
                                   schema,
                                   body={'timePartitioning': {
                                       'type': 'DAY'
                                   }})

        # Upload straight into today's partition via the "$YYYYMMDD"
        # decorator.
        self.bigquery.upload(
            df, self.destination_table + test_id + '$' +
            datetime.today().strftime("%Y%m%d"))

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' +
                datetime.today().strftime("%Y%m%d")),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size

    def test_append_data_to_partition(self):
        test_id = "4"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix,
                                   TABLE_ID + test_id,
                                   schema,
                                   body={'timePartitioning': {
                                       'type': 'DAY'
                                   }})

        self.bigquery.upload(
            df, self.destination_table + test_id + '$' +
            datetime.today().strftime("%Y%m%d"))
        # A second upload with if_exists='append' doubles the partition's
        # row count.
        self.bigquery.upload(df,
                             self.destination_table + test_id + '$' +
                             datetime.today().strftime("%Y%m%d"),
                             if_exists='append')

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' +
                datetime.today().strftime("%Y%m%d")),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size * 2

    def test_replace_data_in_partition(self):
        test_id = "5"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size * 10)
        df2 = make_mixed_dataframe_v2(test_size)

        self.bigquery.upload(df, self.destination_table + test_id + '_buffer')

        # Populate today's partition from the buffer table, as in
        # test_partition_delete above.
        self.bigquery.query(
            f"select * from {self.destination_table}{test_id}_buffer",
            configuration={
                'query': {
                    'destinationTable': {
                        'projectId':
                        _get_project_id(),
                        'datasetId':
                        self.dataset_prefix,
                        'tableId':
                        TABLE_ID + test_id + '$' +
                        datetime.today().strftime("%Y%m%d")
                    },
                    'createDisposition': 'CREATE_IF_NEEDED',
                    'writeDisposition': 'WRITE_TRUNCATE',
                    'allowLargeResults': True,
                    'timePartitioning': {
                        'type': 'DAY'
                    }
                }
            },
            strict=False)

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' +
                datetime.today().strftime("%Y%m%d")),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size * 10

        # if_exists='replace' on a "$YYYYMMDD" decorator overwrites only
        # that partition.
        self.bigquery.upload(df2,
                             self.destination_table + test_id + '$' +
                             datetime.today().strftime("%Y%m%d"),
                             if_exists='replace')

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' +
                datetime.today().strftime("%Y%m%d")),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size

    def test_write_to_the_future(self):
        test_id = "6"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix,
                                   TABLE_ID + test_id,
                                   schema,
                                   body={'timePartitioning': {
                                       'type': 'DAY'
                                   }})

        future = (datetime.today() + timedelta(days=60)).strftime("%Y%m%d")

        # Writes to a future-dated partition are allowed, so this upload
        # should succeed.
        self.bigquery.upload(df,
                             self.destination_table + test_id + '$' + future,
                             if_exists='replace')

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' + future),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size

    def test_write_to_the_past(self):
        test_id = "7"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix,
                                   TABLE_ID + test_id,
                                   schema,
                                   body={'timePartitioning': {
                                       'type': 'DAY'
                                   }})

        past = (datetime.today() - timedelta(days=600)).strftime("%Y%m%d")

        # The same decorator pattern targets a partition far in the past.
        self.bigquery.upload(df,
                             self.destination_table + test_id + '$' + past,
                             if_exists='replace')

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + '$' + past),
            dialect='legacy',
            priority='INTERACTIVE')

        assert result['num_rows'][0] == test_size
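Every test in Example #6 addresses a single partition with BigQuery's "table$YYYYMMDD" decorator and reads it back with dialect='legacy', since decorated table names are not valid identifiers in standard SQL. A hypothetical helper that captures the convention (partition_ref is not part of the library; it only illustrates the string assembly used above):

from datetime import datetime, timedelta

def partition_ref(table, day=None):
    # Build a "table$YYYYMMDD" reference for one daily partition.
    day = day or datetime.today()
    return table + '$' + day.strftime("%Y%m%d")

# partition_ref("mydataset.mytable")                            # today
# partition_ref("mydataset.mytable",
#               datetime.today() + timedelta(days=60))          # a future day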
Example #7
class TestTableOperations(object):
    @classmethod
    def setup_class(cls):
        _skip_if_no_project_id()
        _skip_if_no_private_key_path()

        _setup_common()

    def setup_method(self, method):
        self.bigquery = Bigquery(_get_project_id(), _get_private_key_path())
        self.dataset_prefix = _get_dataset_prefix_random()
        self.destination_table = "{0}.{1}".format(self.dataset_prefix,
                                                  TABLE_ID)

    @classmethod
    def teardown_class(cls):
        pass

    def teardown_method(self, method):
        pass

    def test_table_create(self):
        test_id = "1"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix, TABLE_ID + test_id,
                                   schema)

        assert self.bigquery.verify_schema(self.dataset_prefix,
                                           TABLE_ID + test_id, schema)

    def test_verify_schema_case_insensitive(self):
        test_id = "2"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        self.bigquery.upload(df, self.destination_table + test_id)

        # Field names are deliberately mixed-case: verify_schema should
        # match them case-insensitively.
        assert self.bigquery.verify_schema(
            self.dataset_prefix, TABLE_ID + test_id, {
                'fields': [
                    {
                        'name': 'Bools',
                        'type': 'BOOLEAN'
                    },
                    {
                        'name': 'flts',
                        'type': 'FLOAT'
                    },
                    {
                        'name': 'ints',
                        'type': 'INTEGER'
                    },
                    {
                        'name': 'strS',
                        'type': 'STRING'
                    },
                    {
                        'name': 'TIMES',
                        'type': 'TIMESTAMP'
                    },
                ]
            })

    def test_table_delete(self):
        test_id = "3"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        schema = self.bigquery.generate_schema(df)
        self.bigquery.table_create(self.dataset_prefix, TABLE_ID + test_id,
                                   schema)

        self.bigquery.table_delete(self.dataset_prefix, TABLE_ID + test_id)

        # The table is gone, so the schema check must fail.
        with pytest.raises(GenericGBQException):
            self.bigquery.verify_schema(self.dataset_prefix,
                                        TABLE_ID + test_id, schema)

    def test_upload_data(self):
        test_id = "4"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        self.bigquery.upload(df, self.destination_table + test_id)

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id),
            strict=False,
            priority='INTERACTIVE')
        assert result['num_rows'][0] == test_size

    def test_table_copy(self):
        test_id = "5"
        test_size = 10
        df = make_mixed_dataframe_v2(test_size)

        self.bigquery.upload(df, self.destination_table + test_id)

        # Stage the data into an intermediate table, then copy it with
        # table_copy.
        self.bigquery.query_batch(
            "SELECT * FROM {0}".format(self.destination_table + test_id),
            strict=False,
            configuration={
                'query': {
                    'allowLargeResults': True,
                    'writeDisposition': 'WRITE_TRUNCATE',
                    'priority': 'BATCH',
                    'destinationTable': {
                        'projectId': self.bigquery.project_id,
                        'datasetId': self.dataset_prefix,
                        'tableId': TABLE_ID + test_id + '_intermediate'
                    }
                }
            })

        self.bigquery.table_copy(self.dataset_prefix,
                                 TABLE_ID + test_id + '_intermediate',
                                 self.dataset_prefix,
                                 TABLE_ID + test_id + "_copy")

        result = self.bigquery.query(
            "SELECT COUNT(*) as num_rows FROM {0}".format(
                self.destination_table + test_id + "_copy"),
            strict=False,
            priority='INTERACTIVE')
        assert result['num_rows'][0] == test_size
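The case-insensitivity test in Example #7 implies that verify_schema normalizes field names before comparing. A minimal sketch of such a comparison, assuming the {'fields': [{'name': ..., 'type': ...}]} schema shape used above (the real verify_schema may also check modes and nesting):

def schemas_match(expected, actual):
    # Compare schemas ignoring field-name case and field order.
    def norm(schema):
        return sorted((f['name'].lower(), f['type'].upper())
                      for f in schema['fields'])
    return norm(expected) == norm(actual)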