def cleanup_datasets(bigquery_client: bigquery.Client):
    """Delete stale test datasets from the client's project.

    A dataset is stale when its ID carries the test RESOURCE_PREFIX and
    the date encoded in its name is more than one day old (UTC). Deletion
    removes contents and tolerates datasets that vanish concurrently.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    for candidate in bigquery_client.list_datasets():
        ds_id = candidate.dataset_id
        # Only test-created datasets are eligible; check the prefix first
        # so resource_name_to_date is never called on foreign IDs.
        if not ds_id.startswith(RESOURCE_PREFIX):
            continue
        if resource_name_to_date(ds_id) < cutoff:
            bigquery_client.delete_dataset(
                candidate, delete_contents=True, not_found_ok=True
            )
def create_bq_dataset(dataset_name='price_data'):
    """Create the BigQuery dataset if it does not already exist.

    Args:
        dataset_name (str): Bare dataset ID; the dataset is created in the
            client's default project, in the US location.
    """
    client = Client()
    # Compare fully-qualified "<project>.<dataset>" IDs; a set gives O(1)
    # membership instead of scanning a list.
    full_id = f"{client.project}.{dataset_name}"
    existing = {
        f"{client.project}.{item.dataset_id}"
        for item in client.list_datasets()
    }
    if full_id not in existing:
        # BUG FIX: bigquery.Dataset() requires a fully-qualified
        # "project.dataset" ID; passing the bare dataset name raised
        # ValueError before any dataset could be created.
        dataset = Dataset(full_id)
        dataset.location = "US"
        client.create_dataset(dataset)
    else:
        print("Dataset already exists")
def list_datasets(client: bigquery.Client):
    """
    Lists the datasets in the project.

    Args:
        client: BQ API client (default project defined in your
            GOOGLE_APPLICATION_CREDENTIALS)

    Returns:
        list: dataset IDs (str) of every dataset in the project.

    Examples:
        list_datasets(client)
    """
    # list_datasets() returns an iterator; wrapping it in list() before
    # the comprehension was a redundant extra materialization.
    return [d.dataset_id for d in client.list_datasets()]
def main(prefixes):
    """Delete every dataset whose ID starts with one of *prefixes*.

    Datasets already gone by deletion time are reported, not fatal.
    """
    client = Client()
    # One alternation regex covering all prefixes, anchored at both ends.
    regex = re.compile('|'.join('^{}.*$'.format(p) for p in prefixes))
    # Materialize the listing up front so deletions don't race the pager.
    all_datasets = list(client.list_datasets())
    for dataset in all_datasets:
        dataset_id = dataset.dataset_id
        if not regex.match(dataset_id):
            continue
        print("Deleting dataset: {}".format(dataset_id))
        try:
            client.delete_dataset(dataset.reference, delete_contents=True)
        except NotFound:
            print(" NOT FOUND")
def get_tables_matching_patterns(client: bigquery.Client, patterns: List[str]) -> List[str]:
    """Get BigQuery tables matching the provided patterns.

    Each pattern has the form ``project:dataset.table``; any component may
    use fnmatch-style wildcards or be omitted (project defaults to the
    client's project, dataset and table default to ``*``).

    Args:
        client: Client used to enumerate projects, datasets and tables.
        patterns: Patterns to expand.

    Returns:
        Fully-qualified ``project.dataset.table`` IDs for every match.
    """
    all_projects = None   # lazily-fetched [project_id, ...]
    all_datasets = {}     # project -> [dataset_id, ...] (memoized listings)
    all_tables = {}       # "project.dataset" -> [table item, ...]
    matching_tables = []
    for pattern in patterns:
        project, _, dataset_table = pattern.partition(":")
        dataset, _, table = dataset_table.partition(".")
        projects = [project or client.project]
        dataset = dataset or "*"
        table = table or "*"
        if _uses_wildcards(project):
            if all_projects is None:
                all_projects = [p.project_id for p in client.list_projects()]
            # BUG FIX: fnmatchcase(name, pattern) takes the candidate name
            # first and the wildcard pattern second; the original call had
            # the arguments swapped, so wildcard project patterns matched
            # incorrectly (unlike the dataset/table checks below).
            projects = [p for p in all_projects if fnmatchcase(p, project)]
        for project in projects:
            datasets = [dataset]
            if _uses_wildcards(dataset):
                if project not in all_datasets:
                    all_datasets[project] = [
                        d.dataset_id for d in client.list_datasets(project)
                    ]
                datasets = [
                    d for d in all_datasets[project] if fnmatchcase(d, dataset)
                ]
            for dataset in datasets:
                dataset = f"{project}.{dataset}"
                tables = [f"{dataset}.{table}"]
                if _uses_wildcards(table):
                    if dataset not in all_tables:
                        all_tables[dataset] = list(client.list_tables(dataset))
                    tables = [
                        f"{dataset}.{t.table_id}"
                        for t in all_tables[dataset]
                        if fnmatchcase(t.table_id, table)
                    ]
                matching_tables += tables
    return matching_tables
def get_tables(project_id: str, client: Client, dataset_id: Optional[str] = None) -> Iterator[Table]:
    """
    Gets BigQuery tables from a Google Cloud project.

    Args:
        project_id (str): ID of the project.
        dataset_id (Optional[str]): The ID of the dataset. If `None`, will
            retrieve tables from all datasets in project.
        client (Client): A Google Cloud Client instance.

    Yields:
        Table: A BigQuery table.
    """
    if dataset_id:
        # Single explicit dataset reference.
        dataset_refs = [f"{project_id}.{dataset_id}"]
    else:
        # Lazily walk every dataset in the project.
        dataset_refs = (
            item.reference for item in client.list_datasets(project=project_id)
        )
    for ref in dataset_refs:
        dataset = client.get_dataset(ref)
        for table_item in client.list_tables(dataset):
            yield client.get_table(table_item)
def cleanup_datasets(bq_client: bigquery.Client):
    """Delete every dataset the shared ``prefixer`` marks as stale.

    Deletion removes contents and tolerates concurrent removal.
    """
    for candidate in bq_client.list_datasets():
        if not prefixer.should_cleanup(candidate.dataset_id):
            continue
        bq_client.delete_dataset(
            candidate, delete_contents=True, not_found_ok=True
        )
def validate_dataset(self):
    """Ensure ``self.dataset`` exists in the investing-management project,
    creating it via ``create_bq_dataset`` when absent."""
    client = Client(project='investing-management')
    existing = {item.dataset_id for item in client.list_datasets()}
    if self.dataset not in existing:
        create_bq_dataset(dataset_name=self.dataset)