Example #1
def schema_upgrade_cdm52_to_cdm531(project_id,
                                   dataset_id,
                                   snapshot_dataset_id,
                                   hpo_id=None):
    """
   :param project_id:
   :param dataset_id: Dataset to convert
   :param snapshot_dataset_id: Dataset with converted tables. Overwritten if tables already exist
   :param hpo_id: Identifies the hpo_id of the site
   :return:
    """
    # Create dataset if not exists
    client = bq.get_client(project_id)
    client.create_dataset(snapshot_dataset_id, exists_ok=True)

    sq.create_empty_cdm_tables(snapshot_dataset_id, hpo_id)

    copy_table_job_ids = []
    tables = [table.table_id for table in list(client.list_tables(dataset_id))]
    if hpo_id:
        hpo_tables = [
            resources.get_table_id(table, hpo_id)
            for table in resources.CDM_TABLES + PII_TABLES
        ]
        # Filter tables that do not exist
        tables = [table for table in hpo_tables if table in tables]
    for table_id in tables:
        q = get_upgrade_table_query(client, dataset_id, table_id, hpo_id)
        job_config = QueryJobConfig()
        job_config.destination = f'{client.project}.{snapshot_dataset_id}.{table_id}'
        job_config.use_legacy_sql = False
        job = client.query(q, job_config=job_config)
        copy_table_job_ids.append(job.job_id)
        job.result()  # block until the copy finishes before moving on
    return copy_table_job_ids
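A minimal invocation sketch; the project, dataset, and HPO identifiers below are placeholders, not values from the source:

# Hypothetical usage; all names here are illustrative only
job_ids = schema_upgrade_cdm52_to_cdm531('my-project',
                                         'cdm52_dataset',
                                         'cdm531_snapshot',
                                         hpo_id='hpo_site_x')
print(f'{len(job_ids)} table copy jobs completed')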
Example #2
def __create_job_config(
        self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels
    if ems_query_job_config.destination_table is not None:
        # Default to daily partitioning; overridden below if an explicit
        # time_partitioning is configured
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id
                or self.__project_id,
                ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value
    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value, partitioning.field,
            partitioning.expiration_ms,
            partitioning.require_partition_filter)
    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions
    return job_config
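For comparison, a sketch of the same destination setup written directly against the google-cloud-bigquery API; the project, dataset, and table names are assumptions for illustration:

from google.cloud.bigquery import (DatasetReference, QueryJobConfig,
                                   TableReference, TimePartitioning)

# Hand-built equivalent of the destination branch above (names are placeholders)
config = QueryJobConfig()
config.use_legacy_sql = False
config.time_partitioning = TimePartitioning("DAY")
config.destination = TableReference(
    DatasetReference("my-project", "my_dataset"), "my_table")
config.write_disposition = "WRITE_TRUNCATE"
config.create_disposition = "CREATE_IF_NEEDED"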
Example #3
from datetime import datetime, timedelta
from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

billing_project = "momovn-dev"
conf = QueryJobConfig()
conf.use_query_cache = True
conf.use_legacy_sql = False
start_date = datetime.strptime('20201002', '%Y%m%d').date()
end_date = datetime.strptime('20201002', '%Y%m%d').date()
day_count = (end_date - start_date).days + 1

for checkpointDate in (start_date + timedelta(n) for n in range(day_count)):
    try:
        checkpointDateWithoutDash = checkpointDate.strftime("%Y%m%d")
        checkpointDateWithDash = checkpointDate.strftime("%Y-%m-%d")

        # The date literal must be quoted, otherwise BigQuery parses
        # 2020-10-02 as integer arithmetic
        query = f"""WITH A AS( SELECT GPS.reference PHONE FROM `momovn-prod.HERMES.HERMES_LOCATIONS` GPS WHERE DATE(GPS.event_timestamp,'Asia/Bangkok') = '{checkpointDateWithDash}')
        SELECT COUNT(DISTINCT T1.USER_ID), 'HERMES LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1 
        LEFT JOIN A T2 
        ON T1.USER_ID = T2.PHONE 
        WHERE T2.PHONE IS NULL
        UNION ALL
        SELECT COUNT(DISTINCT T1.USER_ID), 'USER_LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1 
        LEFT JOIN `momovn-prod.HERMES.USER_LOCATIONS_{checkpointDateWithoutDash}` T2 
        ON T1.USER_ID = T2.USER_ID
        WHERE T2.USER_ID IS NULL"""
        # The source snippet is cut off above; the closing WHERE clause and
        # the execution below are an assumed completion.
        client = bigquery.Client(project=billing_project)
        for row in client.query(query, job_config=conf).result():
            print(checkpointDateWithDash, row[0], row[1])
    except Exception as error:
        print(f"Failed for {checkpointDateWithDash}: {error}")
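Splicing the date into the SQL text with f-strings is fragile; where the date is a value rather than a table-name suffix, a query parameter is safer. A minimal sketch reusing the client above:

# Sketch: pass the checkpoint date as a DATE query parameter instead of
# formatting it into the SQL (table-name suffixes still need f-strings)
param_config = QueryJobConfig()
param_config.use_legacy_sql = False
param_config.query_parameters = [
    bigquery.ScalarQueryParameter("checkpoint", "DATE", checkpointDate)
]
sql = ("SELECT COUNT(*) FROM `momovn-prod.HERMES.HERMES_LOCATIONS` "
       "WHERE DATE(event_timestamp, 'Asia/Bangkok') = @checkpoint")
count = list(client.query(sql, job_config=param_config).result())[0][0]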
Example #4
File: fquery.py  Project: nmatare/putils
def _config_query(self, use_legacy_sql):
    job_config = QueryJobConfig()
    job_config.destination = self.temp_table
    job_config.use_legacy_sql = use_legacy_sql
    # allow_large_results only takes effect for legacy SQL and
    # requires a destination table to be set
    job_config.allow_large_results = True
    return job_config
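A sketch of how such a config is typically consumed; the client, helper instance, and query are assumptions for illustration:

# Hypothetical caller; temp_table would be a fully qualified table ID
client = bigquery.Client(project="my-project")
job_config = query_helper._config_query(use_legacy_sql=False)
job = client.query("SELECT 1 AS x", job_config=job_config)
job.result()  # rows land in the configured destination table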
Example #5
def load_from_github(repository_name):
    """
    Load repository information from Github such as pull requests and contributors
    :param repository_name:
    :return:
    """
    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Loading from GitHub")
    repo_api = get_repo_api(repository_name)

    try:
        full_name = repo_api.full_name
    except UnknownObjectException:
        return None
    repo = Repository(id=repo_api.id, full_name=full_name, name=repo_api.name)
    repo.language, created = Language.objects.get_or_create(
        name=repo_api.language)
    repo.save()
    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Getting contributors")
    repo.contributors_count = len(list(repo_api.get_contributors()))

    if config.GET_CONTRIBUTORS_DATA:
        for contrib in repo_api.get_contributors():
            try:
                contributor_db = Contributor.objects.get(
                    login__exact=contrib.login)
            except ObjectDoesNotExist:
                contributor_db = Contributor()
                contributor_db.login = contrib.login
                contributor_db.followers_count = contrib.followers
                contributor_db.url = contrib.html_url
                contributor_db.save()

            contribution_db = Contribution(repository=repo,
                                           contributor=contributor_db,
                                           amount=contrib.contributions)
            contribution_db.save()

    logger.debug(datetime.datetime.now().strftime("%H:%M:%S") +
                 " Getting pull request data")

    if config.USE_BIGQUERY:
        bigquery_client: bigquery.Client = bigquery.Client.from_service_account_json(
            "socialpatterns-c03d755a739c.json")
        repo_url_bigquery = repo_api.html_url.replace("github.com",
                                                      "api.github.com/repos")
        query_config = QueryJobConfig()
        query_config.use_legacy_sql = False
        query_text = """ SELECT Count(*) AS Pull_Request , (SELECT Count(*) FROM `ghtorrent-bq.ght_2018_04_01.issue_comments`        WHERE  issue_id IN (SELECT id  FROM   `ghtorrent-bq.ght_2018_04_01.issues`  WHERE  pull_request_id IN (SELECT id FROM   `ghtorrent-bq.ght_2018_04_01.pull_requests` WHERE   base_repo_id = (SELECT id FROM   `ghtorrent-bq.ght_2018_04_01.projects` AS pj WHERE  pj.url ="%s" LIMIT 1  ))   )) AS Comments  FROM   `ghtorrent-bq.ght_2018_04_01.pull_requests` WHERE  base_repo_id =   (SELECT id   FROM   `ghtorrent-bq.ght_2018_04_01.projects` AS pj   WHERE  pj.url="%s" LIMIT 1   )  """ % (
            repo_url_bigquery, repo_url_bigquery)
        query_job = bigquery_client.query(query_text, job_config=query_config)
        pr_number = list(query_job.result())[0][0]
        comments = list(query_job.result())[0][1]
    else:
        if config.CHECK_CLOSED_PR:
            pull_requests = repo_api.get_pulls(state="all")
        else:
            pull_requests = repo_api.get_pulls()

        pr_number = len(list(pull_requests))
        comments = 0

        for pr in pull_requests:
            try:
                comments += pr.comments
            except ssl.SSLError:
                logger.error("Read timeout when getting comments")
    repo.comments_count = comments
    repo.pull_request_count = pr_number
    repo.save()
    return repo
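A usage sketch; the repository name is a placeholder, and the behavior flags come from the module's own config object:

# Hypothetical invocation; "octocat/Hello-World" is illustrative only
repo = load_from_github("octocat/Hello-World")
if repo is not None:
    print(repo.full_name, repo.pull_request_count, repo.comments_count)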