from google.cloud.bigquery import QueryJobConfig

# bq, sq, resources, PII_TABLES and get_upgrade_table_query are assumed to be
# project-local helpers imported in this module.


def schema_upgrade_cdm52_to_cdm531(project_id,
                                   dataset_id,
                                   snapshot_dataset_id,
                                   hpo_id=None):
    """
    Copy the tables of a CDM v5.2 dataset into a dataset with the CDM v5.3.1 schema.

    :param project_id: identifies the project containing the datasets
    :param dataset_id: dataset to convert
    :param snapshot_dataset_id: dataset with converted tables. Overwritten if
        tables already exist
    :param hpo_id: identifies the hpo_id of the site
    :return: list of job ids for the submitted copy queries
    """
    # Create the destination dataset if it does not exist
    client = bq.get_client(project_id)
    client.create_dataset(snapshot_dataset_id, exists_ok=True)
    sq.create_empty_cdm_tables(snapshot_dataset_id, hpo_id)
    copy_table_job_ids = []
    tables = [table.table_id for table in list(client.list_tables(dataset_id))]
    if hpo_id:
        hpo_tables = [
            resources.get_table_id(table, hpo_id)
            for table in resources.CDM_TABLES + PII_TABLES
        ]
        # Keep only the site tables that actually exist in the source dataset
        tables = [table for table in hpo_tables if table in tables]

    for table_id in tables:
        q = get_upgrade_table_query(client, dataset_id, table_id, hpo_id)
        job_config = QueryJobConfig()
        job_config.destination = f'{client.project}.{snapshot_dataset_id}.{table_id}'
        job_config.use_legacy_sql = False
        job = client.query(q, job_config)
        copy_table_job_ids.append(job.job_id)
        job.result()  # block until this table's copy completes
    return copy_table_job_ids
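A possible call site for the upgrade helper above; the project, dataset, and HPO identifiers are hypothetical placeholders:

# Hypothetical identifiers; substitute real project/dataset names.
job_ids = schema_upgrade_cdm52_to_cdm531('my-project', 'cdm52_src',
                                         'cdm531_snapshot', hpo_id='hpo_x')
print(f'Completed {len(job_ids)} table upgrade jobs')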
def __create_job_config(
        self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels

    if ems_query_job_config.destination_table is not None:
        # Destination tables default to day-based partitioning; an explicit
        # time_partitioning setting below overrides this default
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id or
                self.__project_id, ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value

    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value, partitioning.field,
            partitioning.expiration_ms, partitioning.require_partition_filter)

    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions

    return job_config
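A small, self-contained sketch of the override order used above: the day-based default applied for destination tables is replaced when explicit partitioning settings are supplied. The field name and expiration value here are made up.

from google.cloud.bigquery import QueryJobConfig, TimePartitioning

job_config = QueryJobConfig()
job_config.time_partitioning = TimePartitioning("DAY")  # default for destinations
job_config.time_partitioning = TimePartitioning(
    "DAY", "created_at", 86400000, True)  # explicit settings win
assert job_config.time_partitioning.field == "created_at"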
from datetime import datetime, timedelta, date

from google.cloud import bigquery
from google.cloud.bigquery.table import RowIterator
from google.cloud.bigquery import QueryJobConfig

billing_project = "momovn-dev"
conf = QueryJobConfig()
conf.use_query_cache = True
conf.use_legacy_sql = False

start_date = datetime.strptime('20201002', '%Y%m%d').date()
end_date = datetime.strptime('20201002', '%Y%m%d').date()
day_count = (end_date - start_date).days + 1
for checkpointDate in (start_date + timedelta(n) for n in range(day_count)):
    try:
        checkpointDateWithoutDash = checkpointDate.strftime("%Y%m%d")
        checkpointDateWithDash = checkpointDate.strftime("%Y-%m-%d")
        # The DATE comparison needs a quoted literal in standard SQL
        query = f"""WITH A AS(
            SELECT GPS.reference PHONE
            FROM `momovn-prod.HERMES.HERMES_LOCATIONS` GPS
            WHERE DATE(GPS.event_timestamp,'Asia/Bangkok') = '{checkpointDateWithDash}')
        SELECT COUNT(DISTINCT T1.USER_ID), 'HERMES LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1
        LEFT JOIN A T2 ON T1.USER_ID = T2.PHONE
        WHERE T2.PHONE IS NULL
        UNION ALL
        SELECT COUNT(DISTINCT T1.USER_ID), 'USER_LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1
        LEFT JOIN `momovn-prod.HERMES.USER_LOCATIONS_{checkpointDateWithoutDash}` T2
        ON T1.USER_ID = T2.USER_ID
        WHERE T2.USER_ID IS NULL"""  # closing filter inferred from the parallel branch above
        # The original snippet is truncated here; presumably the query is then run:
        for row in bigquery.Client(project=billing_project).query(
                query, job_config=conf).result():
            print(checkpointDateWithDash, list(row))
    except Exception as error:  # except clause added to close the try block
        print(f"Failed for {checkpointDateWithDash}: {error}")
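A hedged alternative for the date filter: BigQuery query parameters avoid the string-quoting pitfall fixed above entirely. Note that table names, such as the date-suffixed ones in this script, cannot be parameterized and still require string formatting.

from google.cloud import bigquery

param_conf = bigquery.QueryJobConfig()
param_conf.use_legacy_sql = False
param_conf.query_parameters = [
    bigquery.ScalarQueryParameter("checkpoint", "DATE", checkpointDate)
]
# ...then in the SQL:
# WHERE DATE(GPS.event_timestamp, 'Asia/Bangkok') = @checkpoint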
def _config_query(self, use_legacy_sql):
    job_config = QueryJobConfig()
    job_config.destination = self.temp_table
    job_config.use_legacy_sql = use_legacy_sql
    job_config.allow_large_results = True
    return job_config
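For context, `allow_large_results` only has an effect for legacy SQL and requires a destination table, which this helper sets. A hypothetical call site might look like the following; `runner`, `client`, and the table names are assumptions.

# temp_table is assumed to be a TableReference or a
# "project.dataset.table" string on the owning instance.
job_config = runner._config_query(use_legacy_sql=True)
client.query("SELECT * FROM [my-project:my_dataset.big_table]",
             job_config=job_config).result()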
import datetime
import ssl

from django.core.exceptions import ObjectDoesNotExist
from github import UnknownObjectException
from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

# Repository, Language, Contributor, Contribution, config, logger and
# get_repo_api are project-local and assumed to be imported elsewhere.


def load_from_github(repository_name):
    """
    Load repository information from GitHub, such as pull requests and contributors.

    :param repository_name: full name of the repository to load
    :return: the saved Repository instance, or None if the repository is unknown
    """
    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Loading from github")
    repo_api = get_repo_api(repository_name)
    try:
        full_name = repo_api.full_name
    except UnknownObjectException:
        return None
    repo = Repository(id=repo_api.id, full_name=full_name, name=repo_api.name)
    repo.language, created = Language.objects.get_or_create(
        name=repo_api.language)
    repo.save()

    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Getting contributors")
    repo.contributors_count = len(list(repo_api.get_contributors()))
    if config.GET_CONTRIBUTORS_DATA:
        for contrib in repo_api.get_contributors():
            try:
                contributor_db = Contributor.objects.get(
                    login__exact=contrib.login)
            except ObjectDoesNotExist:
                contributor_db = Contributor()
                contributor_db.login = contrib.login
                contributor_db.followers_count = contrib.followers
                contributor_db.url = contrib.html_url
                contributor_db.save()
            contribution_db = Contribution(repository=repo,
                                           contributor=contributor_db,
                                           amount=contrib.contributions)
            contribution_db.save()

    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Getting pull request data")
    if config.USE_BIGQUERY:
        bigquery_client: bigquery.Client = bigquery.Client.from_service_account_json(
            "socialpatterns-c03d755a739c.json")
        # GHTorrent stores API URLs, so translate the HTML URL accordingly
        repo_url_bigquery = repo_api.html_url.replace("github.com",
                                                      "api.github.com/repos")
        query_config = QueryJobConfig()
        query_config.use_legacy_sql = False
        query_text = """
            SELECT Count(*) AS Pull_Request,
                   (SELECT Count(*)
                    FROM `ghtorrent-bq.ght_2018_04_01.issue_comments`
                    WHERE issue_id IN
                          (SELECT id
                           FROM `ghtorrent-bq.ght_2018_04_01.issues`
                           WHERE pull_request_id IN
                                 (SELECT id
                                  FROM `ghtorrent-bq.ght_2018_04_01.pull_requests`
                                  WHERE base_repo_id =
                                        (SELECT id
                                         FROM `ghtorrent-bq.ght_2018_04_01.projects` AS pj
                                         WHERE pj.url = "%s"
                                         LIMIT 1)))) AS Comments
            FROM `ghtorrent-bq.ght_2018_04_01.pull_requests`
            WHERE base_repo_id = (SELECT id
                                  FROM `ghtorrent-bq.ght_2018_04_01.projects` AS pj
                                  WHERE pj.url = "%s"
                                  LIMIT 1)
        """ % (repo_url_bigquery, repo_url_bigquery)
        query_job = bigquery_client.query(query_text, job_config=query_config)
        # Fetch the single result row once instead of re-running result()
        row = list(query_job.result())[0]
        pr_number = row[0]
        comments = row[1]
    else:
        if config.CHECK_CLOSED_PR:
            pull_requests = repo_api.get_pulls(state="all")
        else:
            pull_requests = repo_api.get_pulls()
        pr_number = len(list(pull_requests))
        comments = 0
        for pr in pull_requests:
            try:
                comments += pr.comments
            except ssl.SSLError:
                logger.error("Read timeout when getting comments")
    repo.comments_count = comments
    repo.pull_request_count = pr_number
    repo.save()
    return repo
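A hypothetical call, assuming Django and the project's models and settings are configured; the repository name is a placeholder:

repo = load_from_github("octocat/Hello-World")
if repo is not None:
    print(repo.full_name, repo.pull_request_count, repo.comments_count)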