def test_run_export(self):
    folder_prefix = 'dummy-prefix-2018-03-24/'
    main._upload_achilles_files(test_util.FAKE_HPO_ID, folder_prefix)
    main.run_export(hpo_id=test_util.FAKE_HPO_ID, folder_prefix=folder_prefix)
    bucket_objects = gcs_utils.list_bucket(self.hpo_bucket)
    actual_object_names = [obj['name'] for obj in bucket_objects]
    for report in common.ALL_REPORT_FILES:
        prefix = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + test_util.FAKE_HPO_ID + '/'
        expected_object_name = prefix + report
        self.assertIn(expected_object_name, actual_object_names)
    datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
    self.assertIn(datasources_json_path, actual_object_names)
    datasources_json = gcs_utils.get_object(self.hpo_bucket, datasources_json_path)
    datasources_actual = json.loads(datasources_json)
    datasources_expected = {
        'datasources': [{
            'name': test_util.FAKE_HPO_ID,
            'folder': test_util.FAKE_HPO_ID,
            'cdmVersion': 5
        }]
    }
    self.assertDictEqual(datasources_expected, datasources_actual)
def get_full_result_log():
    full_log = []
    for hpo in resources.hpo_csv():
        hpo_id = hpo['hpo_id']
        hpo_bucket = gcs_utils.get_hpo_bucket(hpo_id)
        try:
            # TODO : figure out possible errors and catch specific bucket inexistence error
            obj_metadata = gcs_utils.get_metadata(hpo_bucket, RESULT_CSV)
        except Exception:
            logging.warning('skipping hpo {}. bucket does not exist.'.format(hpo_id))
            continue
        if obj_metadata is None:
            logging.info('%s was not found in %s' % (RESULT_CSV, hpo_bucket))
        else:
            hpo_result = gcs_utils.get_object(hpo_bucket, RESULT_CSV)
            hpo_result_file = StringIO.StringIO(hpo_result)
            hpo_result_items = resources._csv_file_to_list(hpo_result_file)
            result_objects = map(lambda item: hpo_log_item_to_obj(hpo_id, item),
                                 hpo_result_items)
            full_log.extend(result_objects)
    return full_log
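# A minimal usage sketch for get_full_result_log. This caller is hypothetical;
# the shape of each entry is whatever hpo_log_item_to_obj (not shown in this
# listing) produces for one CSV row.
def print_full_result_log():
    full_log = get_full_result_log()
    for entry in full_log:
        # entry shape is determined by hpo_log_item_to_obj (not shown above)
        print(entry)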
def test_get_object(self):
    with open(FIVE_PERSONS_PERSON_CSV, 'r') as fp:
        expected = fp.read()
    with open(FIVE_PERSONS_PERSON_CSV, 'rb') as fp:
        gcs_utils.upload_object(self.hpo_bucket, 'person.csv', fp)
    result = gcs_utils.get_object(self.hpo_bucket, 'person.csv')
    self.assertEqual(expected, result)
def test_run_export_with_target_bucket(self):
    folder_prefix = 'dummy-prefix-2018-03-24/'
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    test_util.get_synpuf_results_files()
    test_util.populate_achilles(self.hpo_bucket, hpo_id=None)
    main.run_export(folder_prefix=folder_prefix, target_bucket=bucket_nyc)
    bucket_objects = gcs_utils.list_bucket(bucket_nyc)
    actual_object_names = [obj['name'] for obj in bucket_objects]
    for report in common.ALL_REPORT_FILES:
        expected_object_name = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + 'default' + '/' + report
        self.assertIn(expected_object_name, actual_object_names)
    datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
    self.assertIn(datasources_json_path, actual_object_names)
    datasources_json = gcs_utils.get_object(bucket_nyc, datasources_json_path)
    datasources_actual = json.loads(datasources_json)
    datasources_expected = {
        'datasources': [{
            'name': 'default',
            'folder': 'default',
            'cdmVersion': 5
        }]
    }
    self.assertDictEqual(datasources_expected, datasources_actual)
def test_run_export_with_target_bucket_and_datasource_id(self, mock_is_hpo_id):
    # validation/main.py INTEGRATION TEST
    # mock_is_hpo_id is injected by a mock.patch decorator defined upstream (not shown in this snippet)
    mock_is_hpo_id.return_value = True
    folder_prefix = 'dummy-prefix-2018-03-24/'
    bucket_nyc = gcs_utils.get_hpo_bucket('nyc')
    main.run_export(datasource_id=FAKE_HPO_ID,
                    folder_prefix=folder_prefix,
                    target_bucket=bucket_nyc)
    bucket_objects = gcs_utils.list_bucket(bucket_nyc)
    actual_object_names = [obj['name'] for obj in bucket_objects]
    for report in common.ALL_REPORT_FILES:
        prefix = folder_prefix + common.ACHILLES_EXPORT_PREFIX_STRING + FAKE_HPO_ID + '/'
        expected_object_name = prefix + report
        self.assertIn(expected_object_name, actual_object_names)
    datasources_json_path = folder_prefix + common.ACHILLES_EXPORT_DATASOURCES_JSON
    self.assertIn(datasources_json_path, actual_object_names)
    datasources_json = gcs_utils.get_object(bucket_nyc, datasources_json_path)
    datasources_actual = json.loads(datasources_json)
    datasources_expected = {
        'datasources': [{
            'name': FAKE_HPO_ID,
            'folder': FAKE_HPO_ID,
            'cdmVersion': 5
        }]
    }
    self.assertDictEqual(datasources_expected, datasources_actual)
def retract(pids, bucket, found_files, folder_prefix, force_flag):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pids: person_ids to retract
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        table_name = file_name.split(".")[0]
        lines_removed = 0
        if force_flag:
            logger.debug("Attempting to force retract for person_ids %s in path %s/%s%s" %
                         (pids, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            logger.debug("Are you sure you want to retract rows for person_ids %s from path %s/%s%s?" %
                         (pids, bucket, folder_prefix, file_name))
            response = get_response()

        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket, folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            logger.debug("Checking for person_ids %s in path %s/%s%s" %
                         (pids, bucket, folder_prefix, file_name))

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (table_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) in pids) or \
                            (table_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) in pids):
                        lines_removed += 1
                        modified_flag = True
                    else:
                        retracted_file_string.write(input_line + '\n')

            # Write result back to bucket
            if modified_flag:
                logger.debug("Retracted %d rows from %s/%s%s" % (lines_removed, bucket, folder_prefix, file_name))
                logger.debug("Overwriting file %s/%s%s" % (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(bucket, folder_prefix + file_name, retracted_file_string)
                result_list.append(upload_result)
                logger.debug("Retraction successful for file %s/%s%s" % (bucket, folder_prefix, file_name))
            else:
                logger.debug("Skipping file %s/%s%s since pids %s not found" %
                             (bucket, folder_prefix, file_name, pids))
        elif response.lower() == "n":
            logger.debug("Ignoring file %s" % file_name)
    return result_list
def all_required_files_loaded(hpo_id, folder_prefix):
    result_file = gcs_utils.get_object(gcs_utils.get_hpo_bucket(hpo_id),
                                       folder_prefix + common.RESULT_CSV)
    result_file = StringIO.StringIO(result_file)
    result_items = resources._csv_file_to_list(result_file)
    for item in result_items:
        if item['file_name'] in common.REQUIRED_FILES:
            if item['loaded'] != '1':
                return False
    return True
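# Hedged usage sketch for all_required_files_loaded: a caller might gate a
# downstream step on every file in common.REQUIRED_FILES reporting loaded == '1'.
# The hpo_id and folder_prefix values, and this wrapper itself, are illustrative;
# main.run_export and logging are used as shown elsewhere in this listing.
def maybe_run_export(hpo_id, folder_prefix):
    if all_required_files_loaded(hpo_id, folder_prefix):
        main.run_export(hpo_id=hpo_id, folder_prefix=folder_prefix)
    else:
        logging.info('Required files not yet loaded for %s', hpo_id)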
def retract(pid, bucket, found_files, folder_prefix, force):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    :param pid: person_id
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        if force:
            print("Force retracting rows for person_id %s from path %s/%s%s" %
                  (pid, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            print("Are you sure you want to retract rows for person_id %s from path %s/%s%s?" %
                  (pid, bucket, folder_prefix, file_name))
            response = get_response()

        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = StringIO.StringIO()
            input_file_string = gcs_utils.get_object(bucket, folder_prefix + file_name)
            input_contents = input_file_string.split('\n')
            modified_flag = False

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                if input_line != '':
                    if (file_name in PID_IN_COL1 and get_integer(input_line.split(",")[0]) != pid) or \
                            (file_name in PID_IN_COL2 and get_integer(input_line.split(",")[1]) != pid):
                        retracted_file_string.write(input_line + '\n')
                    else:
                        modified_flag = True

            # TODO: return number of lines removed, message if no file in the folder was updated
            # Write result back to bucket
            if modified_flag:
                print("Overwriting file %s/%s%s" % (bucket, folder_prefix, file_name))
                upload_result = gcs_utils.upload_object(bucket, folder_prefix + file_name,
                                                        retracted_file_string)
                result_list.append(upload_result)
            else:
                print("Skipping file %s/%s%s since pid %s not found" %
                      (bucket, folder_prefix, file_name, pid))
        elif response.lower() == "n":
            print("Ignoring file %s" % file_name)
    return result_list
def read_cloud_file(bucket, name):
    return gcs_utils.get_object(bucket, name)
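# Hedged sketch contrasting the two read modes that appear in this listing:
# gcs_utils.get_object returns text by default (as read_cloud_file above uses),
# while passing as_text=False returns bytes (as the retract variant below uses).
# The bucket and object names here are illustrative.
text_contents = read_cloud_file('some-bucket', 'folder/person.csv')
byte_contents = gcs_utils.get_object('some-bucket', 'folder/person.csv', as_text=False)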
def retract(pids, bucket, found_files, folder_prefix, force_flag):
    """
    Retract from a folder in a GCS bucket all records associated with a pid

    pid table must follow schema described in retract_data_bq.PID_TABLE_FIELDS and must reside in sandbox_dataset_id
    This function removes lines from all files containing person_ids if they exist in pid_table_id
    Throws SyntaxError/TypeError/ValueError if non-ints are found

    :param pids: person_ids to retract
    :param bucket: bucket containing records to retract
    :param found_files: files found in the current folder
    :param folder_prefix: current folder being processed
    :param force_flag: if False then prompt for each file
    :return: metadata for each object updated in order to retract
    """
    result_list = []
    for file_name in found_files:
        table_name = file_name.split(".")[0]
        lines_removed = 0
        file_gcs_path = '%s/%s%s' % (bucket, folder_prefix, file_name)
        if force_flag:
            logger.info("Attempting to force retract for person_ids %s in path %s/%s%s" %
                        (pids, bucket, folder_prefix, file_name))
            response = "Y"
        else:
            # Make sure user types Y to proceed
            logger.info("Are you sure you want to retract rows for person_ids %s from path %s/%s%s?" %
                        (pids, bucket, folder_prefix, file_name))
            response = get_response()

        if response == "Y":
            # Output and input file content initialization
            retracted_file_string = BytesIO()
            input_file_bytes = gcs_utils.get_object(bucket, folder_prefix + file_name, as_text=False)
            input_file_lines = input_file_bytes.split(b'\n')
            input_header = input_file_lines[0]
            input_contents = input_file_lines[1:]
            retracted_file_string.write(input_header + b'\n')

            logger.info("Checking for person_ids %s in path %s" % (pids, file_gcs_path))

            # Check if file has person_id in first or second column
            for input_line in input_contents:
                input_line = input_line.strip()
                # ensure line is not empty
                if input_line:
                    cols = input_line.split(b',')
                    # ensure at least two columns exist
                    if len(cols) > 1:
                        col_1 = cols[0]
                        col_2 = cols[1]
                        # skip if non-integer is encountered and keep the line as is
                        try:
                            if (table_name in PID_IN_COL1 and int(col_1) in pids) or \
                                    (table_name in PID_IN_COL2 and int(col_2) in pids):
                                # do not write back this line since it contains a pid to retract
                                # increment removed lines counter
                                lines_removed += 1
                            else:
                                # pid not found, retain this line
                                retracted_file_string.write(input_line + b'\n')
                        except ValueError:
                            # write back non-num lines
                            retracted_file_string.write(input_line + b'\n')
                    else:
                        # write back ill-formed lines. Note: These lines do not make it into BigQuery
                        retracted_file_string.write(input_line + b'\n')

            # Write result back to bucket
            if lines_removed > 0:
                logger.info("%d rows retracted from %s, overwriting..." % (lines_removed, file_gcs_path))
                upload_result = gcs_utils.upload_object(bucket, folder_prefix + file_name,
                                                        retracted_file_string)
                result_list.append(upload_result)
                logger.info("Retraction successful for file %s" % file_gcs_path)
            else:
                logger.info("Not updating file %s since pids %s not found" % (file_gcs_path, pids))
        elif response.lower() == "n":
            logger.info("Skipping file %s" % file_gcs_path)
    return result_list
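# Hedged usage sketch for the retract variant above: a caller would typically
# list the folder, derive the file names relative to the prefix, and pass
# integer person_ids. The pid values and prefix are illustrative; the bucket
# lookup and gcs_utils.list_bucket call follow the tests earlier in this listing.
pids = {1001, 1002}  # person_ids to retract, as ints (matching the int() checks above)
bucket = gcs_utils.get_hpo_bucket('nyc')
folder_prefix = 'dummy-prefix-2018-03-24/'
bucket_objects = gcs_utils.list_bucket(bucket)
found_files = [obj['name'][len(folder_prefix):] for obj in bucket_objects
               if obj['name'].startswith(folder_prefix)]
upload_results = retract(pids, bucket, found_files, folder_prefix, force_flag=True)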