Example #1
def populate_achilles(hpo_bucket, hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()

    test_file_name = achilles.ACHILLES_ANALYSIS + '.csv'
    achilles_analysis_file_path = os.path.join(TEST_DATA_EXPORT_PATH,
                                               test_file_name)
    schema_name = achilles.ACHILLES_ANALYSIS
    write_cloud_file(hpo_bucket, achilles_analysis_file_path)
    gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
    dataset_id = bq_utils.get_dataset_id()
    table_id = bq_utils.get_table_id(hpo_id, achilles.ACHILLES_ANALYSIS)
    bq_utils.load_csv(schema_name, gcs_path, app_id, dataset_id, table_id)

    table_names = [achilles.ACHILLES_RESULTS, achilles.ACHILLES_RESULTS_DIST]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        test_file_name = table_name + '.csv'
        test_file_path = os.path.join(TEST_DATA_EXPORT_SYNPUF_PATH,
                                      table_name + '.csv')
        write_cloud_file(hpo_bucket, test_file_path)
        gcs_path = 'gs://' + hpo_bucket + '/' + test_file_name
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
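Example #1 above discards the return value of bq_utils.wait_on_jobs. As a hedged variant, mirroring the check Example #4 below performs, the final call could surface stuck load jobs; the error message wording is illustrative:

# Hedged variant of the last line of Example #1: fail loudly if any load job
# did not finish, following the pattern used in Example #4 below.
incomplete_jobs = bq_utils.wait_on_jobs(running_jobs)
if incomplete_jobs:
    raise RuntimeError(f'Load job(s) did not complete: {incomplete_jobs}')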
Example #2
    def test_load_csv(self):
        from google.appengine.api import app_identity

        app_id = app_identity.get_application_id()
        table_name = 'achilles_analysis'
        schema_file_name = table_name + '.json'
        csv_file_name = table_name + '.csv'
        schema_path = os.path.join(resources.fields_path, schema_file_name)
        local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                      csv_file_name)
        with open(local_csv_path, 'r') as fp:
            response = gcs_utils.upload_object(self.hpo_bucket, csv_file_name,
                                               fp)
        hpo_bucket = self.hpo_bucket
        gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
        dataset_id = bq_utils.get_dataset_id()
        load_results = bq_utils.load_csv(schema_path, gcs_object_path, app_id,
                                         dataset_id, table_name)

        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_name))
        query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                        locals())
        self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
Example #3
    def load_test_data(self, hpo_id: str = None):
        """
        Load to bq test achilles heel results data from csv file

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """

        table_name: str = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id: str = bq_utils.get_table_id(hpo_id, table_name)
        else:
            table_id: str = table_name
        test_file_name: str = f'{table_id}.csv'
        test_file_path: str = os.path.join(test_util.TEST_DATA_PATH,
                                           test_file_name)

        target_bucket = self.storage_client.get_bucket(self.bucket)
        test_blob = target_bucket.blob(test_file_name)
        test_blob.upload_from_filename(test_file_path)

        gcs_path: str = f'gs://{self.bucket}/{test_file_name}'
        load_results = bq_utils.load_csv(table_name, gcs_path, self.project_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources.csv_to_list(test_file_path)
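A minimal sketch of how a test might call load_test_data above and verify the load, assuming the same test-class attributes (dataset_id, project_id, etc.) and an hpo id that exists only for this illustration; the assertion and row-count extraction are not part of the original snippet:

    # Hypothetical test method; assumes the surrounding TestCase from the
    # snippet above. 'fake_hpo' is a made-up hpo id for illustration.
    def test_heel_results_loaded(self):
        expected_rows = self.load_test_data(hpo_id='fake_hpo')
        table_id = bq_utils.get_table_id('fake_hpo',
                                         common.ACHILLES_HEEL_RESULTS)
        response = bq_utils.query(
            f'SELECT COUNT(1) AS n FROM `{self.dataset_id}.{table_id}`')
        # jobs.query responses expose cell values under rows -> f -> v
        loaded_count = int(response['rows'][0]['f'][0]['v'])
        self.assertEqual(loaded_count, len(expected_rows))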
Example #4
def load_dataset_from_files(dataset_id, path):
    app_id = bq_utils.app_identity.get_application_id()
    bucket = gcs_utils.get_hpo_bucket(test_util.FAKE_HPO_ID)
    test_util.empty_bucket(bucket)
    job_ids = []
    for table in common.CDM_TABLES:
        filename = table + '.csv'
        schema = os.path.join(resources.fields_path, table + '.json')
        f = os.path.join(path, filename)
        if os.path.exists(f):
            with open(f, 'r') as fp:
                gcs_utils.upload_object(bucket, filename, fp)
        else:
            test_util.write_cloud_str(bucket, filename, '\n')
        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(schema,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        job_ids.append(load_job_id)
    incomplete_jobs = bq_utils.wait_on_jobs(job_ids)
    if len(incomplete_jobs) > 0:
        message = "Job id(s) %s failed to complete" % incomplete_jobs
        raise RuntimeError(message)
    test_util.empty_bucket(bucket)
Example #5
def _upload_file_to_bucket(bucket, dataset_id, path, table):
    app_id = bq_utils.app_identity.get_application_id()
    filename = table + '.csv'
    schema = os.path.join(resources.fields_path, table + '.json')
    f = os.path.join(path, filename)
    if os.path.exists(f):
        with open(f, 'r') as fp:
            gcs_utils.upload_object(bucket, filename, fp)
    else:
        test_util.write_cloud_str(bucket, filename, '\n')
    gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket, filename=filename)
    load_results = bq_utils.load_csv(schema, gcs_path, app_id, dataset_id, table, allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
Example #6
    def _upload_file_to_bucket(bucket, dataset_id, path, table):
        app_id = bq_utils.app_identity.get_application_id()
        filename = table + '.csv'

        file_path = os.path.join(path, filename)
        try:
            with open(file_path, 'rb') as filepath:
                gcs_utils.upload_object(bucket, filename, filepath)
        except OSError:
            test_util.write_cloud_str(bucket, filename, '\n')

        gcs_path = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                     filename=filename)
        load_results = bq_utils.load_csv(table,
                                         gcs_path,
                                         app_id,
                                         dataset_id,
                                         table,
                                         allow_jagged_rows=True)
        load_job_id = load_results['jobReference']['jobId']
        return load_job_id
Example #7
    def load_test_data(self, hpo_id=None):
        """
        Load to bq test achilles heel results data from csv file

        :param hpo_id: if specified, prefix to use on csv test file and bq table, otherwise no prefix is used
        :return: contents of the file as list of objects
        """
        schema_path = os.path.join(resources.fields_path,
                                   common.ACHILLES_HEEL_RESULTS + '.json')
        table_id = common.ACHILLES_HEEL_RESULTS
        if hpo_id is not None:
            table_id = bq_utils.get_table_id(hpo_id,
                                             common.ACHILLES_HEEL_RESULTS)
        test_file_name = table_id + '.csv'
        test_file_path = os.path.join(test_util.TEST_DATA_PATH, test_file_name)
        test_util.write_cloud_file(self.bucket, test_file_path)
        gcs_path = 'gs://' + self.bucket + '/' + test_file_name
        load_results = bq_utils.load_csv(schema_path, gcs_path, self.app_id,
                                         self.dataset_id, table_id)
        job_id = load_results['jobReference']['jobId']
        bq_utils.wait_on_jobs([job_id])
        return resources._csv_to_list(test_file_path)
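The hpo_id prefixing described in the docstring above comes from bq_utils.get_table_id; a minimal sketch of that naming convention follows, assuming the table id is simply the hpo id joined to the table name with an underscore (the real helper may differ):

# Hypothetical sketch of the prefixing convention assumed by load_test_data;
# the actual bq_utils.get_table_id implementation is not shown in these examples.
def get_table_id(hpo_id, table_name):
    return f'{hpo_id}_{table_name}' if hpo_id else table_name

# e.g. get_table_id('fake_hpo', 'achilles_heel_results')
#      -> 'fake_hpo_achilles_heel_results'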
Example #8
def populate_achilles(hpo_id=FAKE_HPO_ID, include_heel=True):
    from validation import achilles, achilles_heel
    import app_identity

    app_id = app_identity.get_application_id()
    test_resources_bucket = RESOURCES_BUCKET_FMT.format(project_id=app_id)
    table_names = [
        achilles.ACHILLES_ANALYSIS, achilles.ACHILLES_RESULTS,
        achilles.ACHILLES_RESULTS_DIST
    ]
    if include_heel:
        table_names.append(achilles_heel.ACHILLES_HEEL_RESULTS)

    running_jobs = []
    for table_name in table_names:
        gcs_path = f'gs://{test_resources_bucket}/{table_name}.csv'
        dataset_id = bq_utils.get_dataset_id()
        table_id = bq_utils.get_table_id(hpo_id, table_name)
        load_results = bq_utils.load_csv(table_name, gcs_path, app_id,
                                         dataset_id, table_id)
        running_jobs.append(load_results['jobReference']['jobId'])
    bq_utils.wait_on_jobs(running_jobs)
Example #9
def _upload_file_to_bucket(cls, bucket: str, dataset_id: str, path: str,
                           table: str):
    app_id: str = bq_utils.app_identity.get_application_id()
    filename: str = f'{table}.csv'
    file_path: str = os.path.join(path, filename)
    target_bucket = cls.storage_client.get_bucket(bucket)
    blob = target_bucket.blob(filename)
    try:
        with open(file_path, 'rb') as filepath:
            blob.upload_from_file(filepath)
    except OSError:
        blob.upload_from_string('\n')
    gcs_path: str = 'gs://{bucket}/{filename}'.format(bucket=bucket,
                                                      filename=filename)
    load_results = bq_utils.load_csv(table,
                                     gcs_path,
                                     app_id,
                                     dataset_id,
                                     table,
                                     allow_jagged_rows=True)
    load_job_id = load_results['jobReference']['jobId']
    return load_job_id
Example #10
    def test_load_csv(self):
        app_id = app_identity.get_application_id()
        table_name = 'achilles_analysis'
        csv_file_name = table_name + '.csv'
        local_csv_path = os.path.join(test_util.TEST_DATA_EXPORT_PATH,
                                      csv_file_name)
        sc_bucket = self.client.get_bucket(self.hpo_bucket)
        bucket_blob = sc_bucket.blob(csv_file_name)
        with open(local_csv_path, 'rb') as fp:
            bucket_blob.upload_from_file(fp)
        hpo_bucket = self.hpo_bucket
        gcs_object_path = 'gs://%(hpo_bucket)s/%(csv_file_name)s' % locals()
        dataset_id = self.dataset_id
        load_results = bq_utils.load_csv(table_name, gcs_object_path, app_id,
                                         dataset_id, table_name)

        load_job_id = load_results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([load_job_id])
        self.assertEqual(len(incomplete_jobs), 0,
                         'loading table {} timed out'.format(table_name))
        query_response = bq_utils.query('SELECT COUNT(1) FROM %(table_name)s' %
                                        locals())
        self.assertEqual(query_response['kind'], 'bigquery#queryResponse')
Example #11
def write_to_result_table(project, dataset, site, match_values):
    """
    Append items in match_values to the table generated from site name.

    Attempts to limit the insert query to less than 1MB.

    :param project:  the BigQuery project name
    :param dataset:  name of the dataset containing the table to append to
    :param site:  string identifier for the hpo site
    :param match_values:  dictionary of person_ids and match values for a field

    :return: query results value
    :raises:  oauth2client.client.HttpAccessTokenRefreshError,
              googleapiclient.errors.HttpError
    """
    if not match_values:
        LOGGER.info(f"No values to insert for site: {site}")
        return None

    result_table = site + consts.VALIDATION_TABLE_SUFFIX
    bucket = gcs_utils.get_drc_bucket()
    path = dataset + '/intermediate_results/' + site + '.csv'

    field_list = [consts.PERSON_ID_FIELD]
    field_list.extend(consts.VALIDATION_FIELDS)
    field_list.append(consts.ALGORITHM_FIELD)

    results = StringIO()
    field_list_str = ','.join(field_list) + '\n'
    results.write(field_list_str)

    LOGGER.info(f"Generating csv values to write to storage for site: {site}")

    for person_key, person_values in match_values.items():
        str_list = [str(person_key)]
        for field in consts.VALIDATION_FIELDS:
            value = str(person_values.get(field, consts.MISSING))
            str_list.append(value)

        str_list.append(consts.YES)
        val_str = ','.join(str_list)
        results.write(val_str + '\n')

    LOGGER.info(f"Writing csv file to cloud storage for site: {site}")

    # write results
    results.seek(0)
    gcs_utils.upload_object(bucket, path, results)
    results.close()

    LOGGER.info(
        f"Wrote {len(match_values)} items to cloud storage for site: {site}")

    # wait on results to be written

    schema_path = os.path.join(fields_path, 'identity_match.json')

    LOGGER.info(
        f"Beginning load of identity match values from csv into BigQuery "
        f"for site: {site}")
    try:
        # load csv file into bigquery
        results = bq_utils.load_csv(schema_path,
                                    'gs://' + bucket + '/' + path,
                                    project,
                                    dataset,
                                    result_table,
                                    write_disposition=consts.WRITE_TRUNCATE)

        # ensure the load job finishes
        query_job_id = results['jobReference']['jobId']
        incomplete_jobs = bq_utils.wait_on_jobs([query_job_id])
        if incomplete_jobs != []:
            raise bq_utils.BigQueryJobWaitError(incomplete_jobs)

    except (oauth2client.client.HttpAccessTokenRefreshError,
            googleapiclient.errors.HttpError):
        LOGGER.exception(
            f"Encountered an exception when loading records from csv for site: {site}"
        )
        raise

    LOGGER.info(f"Loaded match values for site: {site}")

    return results
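For orientation, a small hedged sketch of what the match_values argument to write_to_result_table might look like and how the function could be called; the person ids, field names, and match strings below are illustrative assumptions rather than the real consts values:

# Hypothetical call to write_to_result_table; the ids, field names and the
# 'match'/'no_match' strings are made up for illustration only.
example_match_values = {
    1001: {'first_name': 'match', 'last_name': 'match'},
    1002: {'first_name': 'no_match'},  # absent fields fall back to consts.MISSING
}

write_to_result_table(project='aou-curation-project',
                      dataset='validation_dataset',
                      site='fake_hpo',
                      match_values=example_match_values)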