Example #1
class DatasetBackupHandler(webapp2.RequestHandler):
    def __init__(self, request=None, response=None):
        super(DatasetBackupHandler, self).__init__(request, response)

        self.BQ = BigQuery()

        # Check whether this task is a retry of a previous one that failed for
        # some reason; if so, log when it hits the defined retry mark so it
        # can be caught by monitoring:
        Tasks.log_task_metadata_for(request=self.request)

    def post(self):
        project_id = self.request.get('projectId')
        dataset_id = self.request.get('datasetId')
        logging.info('Backing up dataset: ' + dataset_id)
        self.BQ.for_each_table(project_id=project_id,
                               dataset_id=dataset_id,
                               func=self.schedule_backup_task)

    # pylint: disable=R0201
    def schedule_backup_task(self, projectId, datasetId, tableId):
        logging.info("Schedule_backup_task: '%s:%s.%s'", projectId, datasetId,
                     tableId)
        task = Tasks.create(method='GET',
                            url='/tasks/backups/table/{0}/{1}/{2}'.format(
                                projectId, datasetId, tableId))
        Tasks.schedule('backup-worker', task)
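
A minimal wiring sketch, not taken from this project's code: DatasetBackupHandler is a webapp2.RequestHandler whose post() reads the same projectId/datasetId parameters that the task in Example #4 sends to '/tasks/backups/dataset', so a registration along these lines is assumed (module layout and debug flag are guesses):

import webapp2

# Hypothetical WSGI application; the route path mirrors the task URL used in
# Example #4.
app = webapp2.WSGIApplication([
    ('/tasks/backups/dataset', DatasetBackupHandler),
], debug=False)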
Example #2
    def __init__(self, request=None, response=None):
        super(DatasetBackupHandler, self).__init__(request, response)

        self.BQ = BigQuery()

        # Check whether this task is a retry of a previous one that failed for
        # some reason; if so, log when it hits the defined retry mark so it
        # can be caught by monitoring:
        Tasks.log_task_metadata_for(request=self.request)
Example #3
    def test_get_dataset_cached_should_only_call_bq_once_but_response_is_cached(
            self, _):
        # given
        self._create_http.return_value = \
            self.__create_dataset_responses_with_only_one_response_for_get_dataset()
        # when
        bq = BigQuery()
        result1 = bq.get_dataset_cached('project', 'dataset')
        result2 = bq.get_dataset_cached('project', 'dataset')

        # then
        self.assertEqual(result1, result2)
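
The test examples here share one pattern: BigQuery() is constructed on top of a patched HTTP factory (self._create_http) that replays canned JSON payloads. Below is a minimal sketch of that scaffolding, assuming the mock library and googleapiclient's HttpMockSequence; the patch target, helper name and fixture contents are illustrative guesses rather than the project's actual paths. The unused `_` parameters seen in the example tests presumably come from additional @patch decorators applied per test.

from unittest import TestCase

from googleapiclient.http import HttpMockSequence
from mock import patch  # Python 2 mock package; use unittest.mock on Python 3


class BigQueryTestCase(TestCase):
    def setUp(self):
        # Hypothetical patch target: the factory BigQuery() uses to create its
        # HTTP client, replaced so tests never call the real API.
        patcher = patch(
            'src.commons.big_query.big_query.BigQuery._create_http')
        self._create_http = patcher.start()
        self.addCleanup(patcher.stop)

    @staticmethod
    def _create_dataset_responses_with_only_one_response_for_get_dataset():
        # Illustrative canned payloads: the BigQuery discovery document first,
        # then a single datasets.get response; the fixture path is a guess.
        discovery = open('tests/json_samples/bigquery_v2_test_schema.json').read()
        return HttpMockSequence([
            ({'status': '200'}, discovery),
            ({'status': '200'},
             '{"kind": "bigquery#dataset", "id": "project:dataset"}'),
        ])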
Example #4
class BackupScheduler(object):
    def __init__(self):
        self.big_query = BigQuery()
        self.request_correlation_id = str(uuid.uuid4())

    def iterate_over_all_datasets_and_schedule_backups(self):
        custom_project_list = configuration.backup_settings_custom_project_list
        if custom_project_list:
            project_ids = custom_project_list
            logging.info(
                'Only projects specified in the configuration will'
                ' be backed up: %s', project_ids)
        else:
            project_ids = list(self.big_query.list_project_ids())

        logging.info('Scheduling backups of %s projects', len(project_ids))
        for project_id in project_ids:
            try:
                self.__list_and_backup_datasets(project_id)
            except Exception as ex:
                error_message = 'Failed to list and backup datasets: ' + str(
                    ex)
                ErrorReporting().report(error_message)

    def __list_and_backup_datasets(self, project_id):
        if project_id in configuration.projects_to_skip:
            logging.info('Skipping project: %s', project_id)
            return

        logging.info('Backing up project: %s, request_correlation_id: %s',
                     project_id, self.request_correlation_id)
        for dataset_id in self.big_query.list_dataset_ids(project_id):
            try:
                self.__backup_dataset(project_id, dataset_id)
            except Exception as ex:
                error_message = 'Failed to backup dataset: ' + str(ex)
                ErrorReporting().report(error_message)

    def __backup_dataset(self, project_id, dataset_id):
        logging.info('Backing up dataset: %s', dataset_id)
        task = Tasks.create(url='/tasks/backups/dataset',
                            params={
                                'projectId': project_id,
                                'datasetId': dataset_id
                            },
                            headers={
                                request_correlation_id.HEADER_NAME:
                                self.request_correlation_id
                            })
        Tasks.schedule('backup-scheduler', task)
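
A short usage sketch, assuming the scheduler is kicked off from a request handler in the same webapp2 style as Example #1 (this handler and its route are hypothetical):

import webapp2

class BackupSchedulerHandler(webapp2.RequestHandler):  # hypothetical handler
    def get(self):
        # Schedule backups for every project that is not configured to be skipped.
        BackupScheduler().iterate_over_all_datasets_and_schedule_backups()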
Example #5
class TableRetention(object):
    def __init__(self, policy):
        self.big_query_service = BigQuery()
        self.policy = policy

    def perform_retention(self, table_reference, table_key):
        backups = Backup.get_all_backups_sorted(ndb.Key(urlsafe=table_key))
        logging.debug("Fetched %s backups for the table: %s", len(backups),
                      table_reference)

        if not ShouldPerformRetentionPredicate.test(backups):
            return

        logging.info("Retention policy used for table '%s': '%s'",
                     table_reference,
                     type(self.policy).__name__)

        for backup in self.policy\
                .get_backups_eligible_for_deletion(backups=backups,
                                                   table_reference=table_reference):
            self.__delete_backup_in_bq_and_update_datastore(backup)

    def __delete_backup_in_bq_and_update_datastore(self, backup):
        try:
            table_reference = TableReference(configuration.backup_project_id,
                                             backup.dataset_id,
                                             backup.table_id)

            self.big_query_service.delete_table(table_reference)
            logging.debug(
                u"Table %s deleted from BigQuery. "
                u"Updating datastore. Retention policy used: '%s'",
                table_reference,
                type(self.policy).__name__)
            Backup.mark_backup_deleted(backup.key)
        except TableNotFoundException:
            Backup.mark_backup_deleted(backup.key)
            logging.warning(
                u"Table '%s' was not found, but the datastore entry was "
                u"updated anyway", backup.table_id)
        except HttpError as ex:
            error_message = u"Unexpected HttpError occurred while deleting " \
                            u"table '{}', error: {}: {}"\
                .format(backup.table_id, type(ex), ex)
            logging.exception(error_message)
        except Exception as ex:
            error_message = u"Could not delete backup '{}' error: {}: {}"\
                .format(backup.table_id, type(ex), ex)
            logging.exception(error_message)
Example #6
    def __get_table_or_partition(project_id, dataset_id, table_id,
                                 partition_id):
        table_metadata = BigQuery().get_table(
            project_id, dataset_id,
            BigQueryTableMetadata.get_table_id_with_partition_id(
                table_id, partition_id))
        return BigQueryTableMetadata(table_metadata)
Example #7
    def test_execute_query_when_executing_long_query(self, _):
        # given
        self._create_http.return_value = self.__execute_long_query_responses()
        # when
        result = BigQuery().execute_query("SELECT * FROM tableXYZ")
        # then
        self.assertEqual(result, [
            {
                "f": [
                    {
                        "v": "a-gcp-project2"
                    },
                    {
                        "v": "test1"
                    }
                ]
            },
            {
                "f": [
                    {
                        "v": "a-gcp-project3"
                    },
                    {
                        "v": "smoke_test_US"
                    }
                ]
            }
        ])
Example #8
    def __init__(self):
        big_query = BigQuery()
        self.querier = SLIViewQuerier(big_query, QualityQuerySpecification())
        self.streamer = SLIResultsStreamer(table_id="SLI_backup_quality")
        self.table_newer_modification_predicate = SLITableNewerModificationPredicate(
            big_query)
        self.table_existence_predicate = SLITableExistsPredicate(
            big_query, QualityQuerySpecification)
Example #9
    def test_iterating_projects(self, _):
        # given
        self._create_http.return_value = self.__create_project_list_responses()
        bq = BigQuery()
        # when
        project_ids, next_page_token = bq.list_project_ids()

        # then
        self.assertEqual(self.count(project_ids), 3)
        self.assertEqual(next_page_token, '3')

        # when (next_page_token)
        project_ids, next_page_token = bq.list_project_ids(
            page_token=next_page_token)
        # then
        self.assertEqual(self.count(project_ids), 1)
        self.assertEqual(next_page_token, None)
Example #10
    def test_should_return_false_when_there_is_no_schema(self):
        # given
        sli_table = self.__create_non_partitioned_sli_table()

        # when
        exists = SLITableExistsPredicate(BigQuery(), LatencyQuerySpecification).exists(sli_table)

        # then
        self.assertFalse(exists)
Example #11
    def test_should_return_true_for_existing_partition(self):
        # given
        sli_table = self.__create_partitioned_sli_table()

        # when
        exists = SLITableExistsPredicate(BigQuery(), LatencyQuerySpecification).exists(sli_table)

        # then
        self.assertTrue(exists)
Example #12
    def test_iterating_tables(self, _):
        # given
        self._create_http.return_value = self.__create_tables_list_responses()

        # when
        tables_ids = BigQuery().list_table_ids("project1233", "dataset_id")

        # then
        self.assertEqual(self.count(tables_ids), 5)
Example #13
    def test_iterating_datasets(self, _):
        # given
        self._create_http.return_value = self.__create_dataset_list_responses()

        # when
        dataset_ids = BigQuery().list_dataset_ids("project123")

        # then
        self.assertEqual(self.count(dataset_ids), 3)
Example #14
    def test_iterating_projects(self, _):
        # given
        self._create_http.return_value = self.__create_project_list_responses()

        # when
        project_ids = BigQuery().list_project_ids()

        # then
        self.assertEqual(self.count(project_ids), 4)
Example #15
    def test_iterating_datasets(self, _):
        # given
        self._create_http.return_value = self.__create_dataset_list_responses()
        bq = BigQuery()
        # when
        dataset_ids, next_page_token = bq.list_dataset_ids("project123")

        # then
        self.assertEqual(self.count(dataset_ids), 2)
        self.assertEqual(next_page_token, 'FMLMpsxvgM')

        # when
        dataset_ids, next_page_token = bq.list_dataset_ids(
            "project123", page_token=next_page_token)

        # then
        self.assertEqual(self.count(dataset_ids), 1)
        self.assertEqual(next_page_token, None)
Example #16
    def start(table_reference):
        big_query_table_metadata = BigQueryTableMetadata.get_table_by_reference(
            table_reference)

        BackupProcess(
            table_reference=table_reference,
            big_query=BigQuery(),
            big_query_table_metadata=big_query_table_metadata,
            should_backup_predicate=OnDemandBackupPredicate()).start()
Example #17
    def test_listing_table_partitions_when_table_not_exist_should_throw_table_not_found_exception(
            self, _):
        # given
        self._create_http.return_value = \
            self.__create_table_partititions_list_responses_table_404_not_found()
        # when & then
        with self.assertRaises(TableNotFoundException) as exception:
            BigQuery().list_table_partitions("project123", "dataset123",
                                             "table123")
Example #18
    def test_when_dataset_not_exist_then_iterating_tables_should_not_return_any_table(
            self, _):
        # given
        self._create_http.return_value = self.__create_dataset_not_found_during_tables_list_responses()

        # when
        tables_ids = BigQuery().list_table_ids("projectid", "dataset_id")

        # then
        self.assertEqual(self.count(tables_ids), 0)
Example #19
    def test_iterating_tables_should_retry_if_gets_http_503_response_once(
            self, func, _, _1):
        # given
        self._create_http.return_value = self.__create_tables_list_responses_with_503()

        # when
        BigQuery().for_each_table("project1233", "dataset_id", func)

        # then
        self.assertEqual(5, func.call_count)
Example #20
    def test_iterating_tables(self, _):
        # given
        self._create_http.return_value = self.__create_tables_list_responses()
        bq = BigQuery()
        # when
        tables_ids, next_page_token = bq.list_table_ids(
            "project1233", "dataset_id")

        # then
        self.assertEqual(self.count(tables_ids), 4)
        self.assertEqual(next_page_token, 'table_id_5')

        # when
        tables_ids, next_page_token = bq.list_table_ids(
            "project1233", "dataset_id", page_token=next_page_token)

        # then
        self.assertEqual(self.count(tables_ids), 1)
        self.assertEqual(next_page_token, None)
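
Examples #15 and #20 exercise a (ids, next_page_token) paging contract. A small sketch of draining every page with that contract, assuming only the list_table_ids signature shown above and that page_token=None behaves like omitting the argument (the helper itself is illustrative):

def list_all_table_ids(bq, project_id, dataset_id):
    # Keep requesting pages until the API stops returning a page token.
    page_token = None
    while True:
        table_ids, page_token = bq.list_table_ids(
            project_id, dataset_id, page_token=page_token)
        for table_id in table_ids:
            yield table_id
        if page_token is None:
            break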
Example #21
    def test_insert_job_forwarding_503_error(self, _):
        # given
        self._create_http.return_value = self.__create_503_response()

        # when
        with self.assertRaises(HttpError) as context:
            BigQuery().insert_job('project_id', {})

        # then
        self.assertEqual(context.exception.resp.status, 503)
Example #22
    def test_should_not_list_partitions_in_non_partitioned_table(self, list_table_partitions):
        # given
        sli_table = self.__create_non_partitioned_sli_table()

        # when
        exists = SLITableExistsPredicate(BigQuery(), LatencyQuerySpecification).exists(sli_table)

        # then
        self.assertTrue(exists)
        list_table_partitions.assert_not_called()
Example #23
    def __init__(self, x_days):
        self.x_days = x_days
        big_query = BigQuery()
        self.streamer = SLIResultsStreamer(
            table_id="SLI_backup_creation_latency"
        )
        self.table_existence_predicate = SLITableExistsPredicate(
            big_query, LatencyQuerySpecification)
        self.table_recreation_predicate = SLITableRecreationPredicate(big_query)
        self.table_emptiness_predicate = SLITableEmptinessPredicate(big_query)
        self.table_has_any_backup_predicate = SLITableHasAnyBackupPredicate()
Example #24
    def test_should_return_false_if_backup_table_doesnt_exists(self):
        # given
        sli_table = self.__create_sli_entry_without_census_data()

        # when
        is_not_seen_by_census = SLIBackupTableNotSeenByCensusPredicate(
            BigQuery(), QualityQuerySpecification).is_not_seen_by_census(
            sli_table)

        # then
        self.assertFalse(is_not_seen_by_census)
Example #25
    def __init__(self, x_days):
        self.x_days = x_days
        big_query = BigQuery()
        self.querier = SLIViewQuerier(big_query,
                                      LatencyQuerySpecification(self.x_days))
        self.streamer = SLIResultsStreamer(
            table_id="SLI_backup_creation_latency")
        self.table_existence_predicate = SLITableExistsPredicate(
            big_query, LatencyQuerySpecification)
        self.table_recreation_predicate = SLITableRecreationPredicate(
            big_query)
Example #26
    def test_should_return_false_if_backup_table_havent_data_from_census_and_datastore_num_bytes_are_different_than_reality(
            self):
        # given
        sli_table = self.__create_sli_entry_without_census_data()

        # when
        is_not_seen_by_census = SLIBackupTableNotSeenByCensusPredicate(
            BigQuery(), QualityQuerySpecification).is_not_seen_by_census(
            sli_table)

        # then
        self.assertFalse(is_not_seen_by_census)
Example #27
    def create_the_same_empty_table(self, target_reference):
        body = {
            "tableReference": {
                "projectId": target_reference.get_project_id(),
                "datasetId": target_reference.get_dataset_id(),
                "tableId": target_reference.get_table_id(),
            },
            "timePartitioning": self.table_metadata.get("timePartitioning"),
            "schema": self.table_metadata.get("schema")
        }
        BigQuery().create_table(target_reference.get_project_id(),
                                target_reference.get_dataset_id(), body)
Example #28
    def __schedule(source_big_query_table, target_big_query_table, job_id,
                   create_disposition, write_disposition):
        logging.info("Scheduling job ID: " + job_id)
        target_project_id = target_big_query_table.get_project_id()
        job_data = {
            "jobReference": {
                "jobId": job_id,
                "projectId": target_project_id
            },
            "configuration": {
                "copy": {
                    "sourceTable": {
                        "projectId": source_big_query_table.get_project_id(),
                        "datasetId": source_big_query_table.get_dataset_id(),
                        "tableId": source_big_query_table.get_table_id(),
                    },
                    "destinationTable": {
                        "projectId": target_project_id,
                        "datasetId": target_big_query_table.get_dataset_id(),
                        "tableId": target_big_query_table.get_table_id(),
                    },
                    "createDisposition": create_disposition,
                    "writeDisposition": write_disposition
                }
            }
        }
        try:
            job_reference = BigQuery().insert_job(target_project_id, job_data)
            logging.info("Successfully inserted: %s", job_reference)
            return job_reference
        except HttpError as bq_error:
            copy_job_error = BigQueryJobError(bq_error, source_big_query_table,
                                              target_big_query_table)
            if copy_job_error.is_deadline_exceeded():
                job_json = CopyJobService.__get_job(job_id, target_project_id,
                                                    copy_job_error.location)
                return CopyJobService.__to_bq_job_reference(job_json)
            elif copy_job_error.should_be_retried():
                logging.warning(copy_job_error)
                return BigQueryJobReference(
                    project_id=target_project_id,
                    job_id=job_id,
                    location=BigQueryTableMetadata.
                    get_table_by_big_query_table(
                        source_big_query_table).get_location())
            else:
                logging.exception(copy_job_error)
                return copy_job_error
        except Exception as error:
            logging.error("%s Exception thrown during Copy Job creation: %s",
                          type(error), error)
            raise error
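
The job_data dict built above is a standard BigQuery v2 copy-job configuration, so insert_job presumably wraps google-api-python-client's jobs().insert call. A hedged sketch of such a wrapper, assuming application default credentials (the function name and service construction are illustrative, not this project's code):

from googleapiclient.discovery import build

def insert_job_sketch(project_id, job_data):
    # Build a BigQuery v2 client and submit the copy-job body shown above.
    service = build('bigquery', 'v2')
    return service.jobs().insert(projectId=project_id, body=job_data).execute()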
Example #29
    def test_listing_table_partitions(self, _):
        # given
        self._create_http.return_value = self.__create_table_partititions_list_responses()
        # when
        partitions = BigQuery() \
            .list_table_partitions("project123", "dataset123", "table123")

        # then
        self.assertEqual(self.count(partitions), 5)
        self.assertEqual(partitions[0]['partitionId'], '20170317')
        self.assertEqual(partitions[0]['creationTime'],
                         '2017-03-17 14:32:17.755000')
        self.assertEqual(partitions[0]['lastModifiedTime'],
                         '2017-03-17 14:32:19.289000')
Example #30
class OrganizationBackupScheduler(object):
    def __init__(self):
        self.big_query = BigQuery()
        self.custom_projects_list = configuration.backup_settings_custom_project_list
        self.projects_to_skip = configuration.projects_to_skip

    def schedule_backup(self, page_token=None):
        if self.custom_projects_list:
            self._schedule_project_backup_scheduler_for_custom_project_list()
            return

        projects_ids_to_backup, next_page_token = self.big_query.list_project_ids(
            page_token=page_token)

        self._schedule_project_backup_scheduler_tasks(projects_ids_to_backup)

        if next_page_token:
            logging.info(
                u'Scheduling Organisation Backup Scheduler task for page_token: %s',
                next_page_token)
            Tasks.schedule(
                'backup-scheduler',
                TaskCreator.create_organisation_backup_scheduler_task(
                    page_token=next_page_token))

    def _schedule_project_backup_scheduler_tasks(self, project_ids):
        logging.info(
            u'Scheduling Project Backup Scheduler tasks for %s projects: %s',
            len(project_ids), project_ids)

        tasks = []

        for project_id in project_ids:

            if project_id not in self.projects_to_skip:
                tasks.append(
                    TaskCreator.create_project_backup_scheduler_task(
                        project_id=project_id))
            else:
                logging.info(u'Project %s is skipped.', project_id)

        Tasks.schedule('backup-scheduler', tasks)

    def _schedule_project_backup_scheduler_for_custom_project_list(self):
        logging.info(
            u'Custom project list is defined. Only projects defined in configuration will be scheduled for backup'
        )
        self._schedule_project_backup_scheduler_tasks(
            self.custom_projects_list)