def get_objects_for_day(dt, source, delegation_token):
    # dt - date
    # source - specifies the source
    #     region - the region of the pulse data
    #     bucket - the bucket name, e.g. "problems"
    #     ads - list of ADs in the region, such as ['AD_1', 'AD_2', 'AD_3']
    # delegation_token - the oci delegation token
    src_region = source['region']
    src_bucket = source['bucket']
    src_ads = source['ads']

    os_client = dfapp_get_os_client(src_region, delegation_token)

    obj_infos = []
    for ad in src_ads:
        prefix = f"{src_region}_{ad}/{src_bucket}/{dt.year}/{dt.month}/{dt.day}/"
        for record in oci.pagination.list_call_get_all_results_generator(
            os_client.list_objects,
            'record',
            PULSE_NAMESPACE,
            src_bucket,
            prefix=prefix,
            fields="name, size",
        ):
            obj_infos.append({
                "name": record.name,
                "size": record.size,
                "region": src_region,
                "ad": ad,
            })
    return obj_infos
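# A minimal usage sketch for get_objects_for_day, illustration only (never called by
# the pipeline). The region, AD names, and the date below are hypothetical
# placeholders; the delegation token is assumed to come from
# get_delegation_token(spark), as in the other functions here.
def _example_get_objects_for_day(delegation_token):
    from datetime import date
    source = {
        'region': 'us-ashburn-1',           # hypothetical Pulse region
        'bucket': 'problems',               # Pulse bucket name
        'ads': ['AD_1', 'AD_2', 'AD_3'],    # ADs in that region
    }
    obj_infos = get_objects_for_day(date(2020, 6, 1), source, delegation_token)
    # each entry looks like:
    #   {'name': <object name>, 'size': <bytes>, 'region': 'us-ashburn-1', 'ad': 'AD_1'}
    return obj_infos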
def main(spark, args):
    # dummy code to load a dataframe
    Student = Row("id", "name")
    df = spark.createDataFrame(
        [Student(1, 'foo'), Student(3, 'bar'), Student(2, 'tar')])
    df.show()

    # list objects for problems
    client = dfapp_get_os_client(
        spark, "https://objectstorage.us-ashburn-1.oraclecloud.com")
    # print("oci is imported")
    # conf = spark.sparkContext.getConf()
    # token_path = conf.get("spark.hadoop.fs.oci.client.auth.delegationTokenPath")
    # # read in token
    # with open(token_path) as fd:
    #     delegation_token = fd.read()
    # print("delegation_token = {}".format(delegation_token))
    # signer = oci.auth.signers.InstancePrincipalsDelegationTokenSigner(delegation_token=delegation_token)
    # print("signer created")
    # client = oci.object_storage.ObjectStorageClient(
    #     {}, signer=signer, service_endpoint='https://objectstorage.us-ashburn-1.oraclecloud.com', timeout=60.0
    # )
    # print("object storage client created")
    r = client.list_objects("oci-pulse-prod", "problems")
    for object_summary in r.data.objects:
        print(">> {}".format(object_summary.name))
    print("list object done")
def generate_parquet(spark, server_channel, dt, source, destination,
                     partition_count, application_id, application_args):
    # spark - spark session
    # dt - date, the day being processed
    # source - specifies the source
    #     bucket - the pulse bucket we are handling
    # destination
    #     region - the region for saving the pulse data
    #     bucket - the bucket for saving the pulse data
    #     namespace - the namespace for saving the pulse data
    source_bucket = source['bucket']
    dest_region = destination['region']
    dest_namespace = destination['namespace']
    dest_bucket = destination['bucket']

    os_client = dfapp_get_os_client(dest_region, get_delegation_token(spark))
    os_delete_objects(os_client, dest_namespace, dest_bucket, f"{dt}/")

    rpl = RawPulseDataLocations(destination)

    # save pulse data
    write_location = rpl.url_from_path(f"{dt}/raw.parquet")
    df = spark.read.json(rpl.url_from_path(f"__stage__/{dt}/raw/*"))
    df.coalesce(partition_count).write.partitionBy("_sys_region").parquet(
        write_location)

    data_time = datetime.combine(dt, datetime.min.time())
    df = spark.read.parquet(write_location)

    # register it to the data catalog
    dcc = DataCatalogClientProxy(server_channel)
    loader = Loader(dcc=dcc)
    sample_data = get_dataframe_sample_data(df)
    dsi = loader.register_asset(
        spark,
        f"daily_pulse_{source_bucket}_raw:1.0:1:/{dt}",
        'hwd',
        'parquet',
        write_location,
        df.count(),
        df.schema.jsonValue(),
        sample_data=sample_data,
        data_time=data_time,
        application_id=application_id,
        application_args=json.dumps(application_args),
    )

    # save summary
    df = spark.read.json(rpl.url_from_path(f"__stage__/{dt}/summary/*"))
    # the summary file is small, so 1 partition per region is fine
    df.coalesce(1).write.partitionBy("region").parquet(
        rpl.url_from_path(f"{dt}/summary.parquet"))

    os_delete_objects(os_client, dest_namespace, dest_bucket, f"__stage__/{dt}/raw/")
    os_delete_objects(os_client, dest_namespace, dest_bucket, f"__stage__/{dt}/summary/")
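# Illustrative call sketch for generate_parquet (not invoked by the pipeline). The
# server_channel, partition count, application id/args, and the bucket/namespace
# values are hypothetical placeholders; only the dict shapes follow the parameter
# comments above.
def _example_generate_parquet(spark, server_channel):
    from datetime import date
    dt = date(2020, 6, 1)
    source = {'bucket': 'problems'}
    destination = {
        'region': 'us-ashburn-1',        # hypothetical destination region
        'namespace': 'my-namespace',     # hypothetical namespace
        'bucket': 'problems-prod',       # hypothetical destination bucket
    }
    # writes {dt}/raw.parquet partitioned by _sys_region and {dt}/summary.parquet
    # partitioned by region under the destination bucket, then drops the staging area
    generate_parquet(spark, server_channel, dt, source, destination,
                     partition_count=16, application_id='example-app',
                     application_args={})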
def get_os_client_ex(spark, region):
    from oci_core import dfapp_get_os_client, get_delegation_token, get_os_client
    if USE_INSTANCE_PRINCIPLE:
        delegation_token = get_delegation_token(spark)
        os_client = dfapp_get_os_client(region, delegation_token)
    else:
        # fall back to an API-key based client: write the private key to a temp
        # file and point the OCI config at it
        with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as key_f:
            key_f.write(OCI_KEY)
        _oci_config = dict(OCI_CONFIG)
        _oci_config['key_file'] = key_f.name
        os_client = get_os_client(None, config=_oci_config)
    return os_client
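# Sketch of the OCI_CONFIG shape get_os_client_ex relies on when instance principal
# is disabled. Assumption: a standard OCI SDK API-key config; every value below is a
# placeholder. OCI_KEY would hold the matching PEM private key text, and
# get_os_client_ex injects 'key_file' itself.
_EXAMPLE_OCI_CONFIG = {
    'user': 'ocid1.user.oc1..aaaa...',          # placeholder user OCID
    'tenancy': 'ocid1.tenancy.oc1..aaaa...',    # placeholder tenancy OCID
    'fingerprint': 'aa:bb:cc:...',              # placeholder API key fingerprint
    'region': 'us-ashburn-1',                   # placeholder region
}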
def stage_objects(group_idx, group_creation_time, round, max_group_time,
                  source_region, total_size, dt, obj_infos, delegation_token,
                  destination, pulse_bucket_name):
    # group_idx - group index, each task has a group index
    # group_creation_time - when the group is created, datetime
    # round - the round number, integer
    # max_group_time - max time a group is allowed, timedelta
    # source_region - the pulse region
    # total_size - total raw pulse object size in bytes
    # dt - date, the day being processed
    # obj_infos - list of dicts which contain name, size, region and ad
    #             (the PULSE object name, object size, region and ad)
    # delegation_token - oci delegation token
    # destination
    #     region - the region for saving the pulse data
    #     bucket - the bucket for saving the pulse data
    #     namespace - the namespace for saving the pulse data
    # pulse_bucket_name - Pulse bucket name
    current_dir = os.getcwd()
    begin_time = datetime.utcnow()
    debug_info = {
        "download_file_count": 0,           # number of gzipped pulse raw files downloaded
        "download_duration": 0.0,           # time to download gzipped pulse raw files
        "download_size": 0,                 # total bytes of gzipped pulse raw files downloaded
        "total_size": total_size,           # total bytes of gzipped pulse raw files (from the list objects api),
                                            # should match download_size
        "unzip_file_count": 0,              # number of times we unzip a gzipped raw pulse file
        "unzip_duration": 0.0,              # time to unzip gzipped raw pulse files
        "load_json_count": 0,               # number of json objects we try to load from unzipped pulse files
        "load_json_duration": 0.0,          # time to load json
        "dump_json_line_duration": 0.0,     # time to dump a json object to a one-line string for the output
        "write_stage_file_duration": 0.0,   # time to write to the staging file
        "gzip_output_duration": 0.0,        # time to gzip the output json for pulse data
        "upload_duration": 0.0,             # time for upload to object storage, both raw json and summary json
        "upload_size": 0,                   # total bytes uploaded
        "upload_file_count": 0,             # number of files uploaded
        "total_duration": 0.0,              # total duration for this task
        "unhandled_count": 0,               # number of object_infos that were skipped
    }

    rpl = RawPulseDataLocations(destination)
    stage_path = f"__stage__/{dt}/raw"
    summary_path = f"__stage__/{dt}/summary"
    dest_region = destination['region']
    dest_namespace = destination['namespace']
    dest_bucket = destination['bucket']

    save_os_client = dfapp_get_os_client(dest_region, delegation_token)
    pulse_os_client = dfapp_get_os_client(source_region, delegation_token)

    summaries = []
    local_stage_filename = os.path.join(
        current_dir, f"__{round}_{group_idx}-{pulse_bucket_name}-{dt}-stage__.json")
    local_stage_gz_filename = f"{local_stage_filename}.gz"
    if os.path.isfile(local_stage_filename):
        os.remove(local_stage_filename)
    if os.path.isfile(local_stage_gz_filename):
        os.remove(local_stage_gz_filename)

    unhandled_object_infos = []
    with open(local_stage_filename, "a+t") as f:
        for obj_info_idx, obj_info in enumerate(obj_infos):
            now = datetime.utcnow()
            if (now - group_creation_time) >= max_group_time:
                unhandled_object_infos = obj_infos[obj_info_idx:]
                break
            object_name = obj_info['name']
            filename_md5 = hashlib.md5(object_name.encode('utf-8')).hexdigest()
            rows, status = object_to_json(pulse_os_client, pulse_bucket_name,
                                          object_name, debug_info)
            for idx, row in enumerate(rows):
                row['_sys_region'] = obj_info['region']
                row['_sys_filename_md5'] = filename_md5
                row['_sys_idx'] = idx

                t1 = datetime.utcnow()
                line = f"{json.dumps(row)}\n"
                t2 = datetime.utcnow()
                duration = (t2 - t1).total_seconds()
                debug_info['dump_json_line_duration'] += duration

                t1 = datetime.utcnow()
                f.write(line)
                t2 = datetime.utcnow()
                duration = (t2 - t1).total_seconds()
                debug_info['write_stage_file_duration'] += duration

            summary = {
                "count": len(rows),
                "ad": obj_info['ad'],
                "region": obj_info['region'],
                "filename_md5": filename_md5,
                "filename": obj_info['name'],
                "filesize": obj_info['size'],
                "date": str(dt),
                "status": status,
            }
            summaries.append(summary)

    t1 = datetime.utcnow()
    subprocess.check_call(['gzip', local_stage_filename])
    t2 = datetime.utcnow()
    duration = (t2 - t1).total_seconds()
    debug_info['gzip_output_duration'] = duration

    filesize = os.path.getsize(local_stage_gz_filename)
    t1 = datetime.utcnow()
    os_upload(save_os_client, local_stage_gz_filename, dest_namespace, dest_bucket,
              f"{stage_path}/{round:03}_{group_idx:07}.json.gz")
    if os.path.isfile(local_stage_filename):
        os.remove(local_stage_filename)
    if os.path.isfile(local_stage_gz_filename):
        os.remove(local_stage_gz_filename)
    t2 = datetime.utcnow()
    duration = (t2 - t1).total_seconds()
    debug_info['upload_duration'] += duration
    debug_info['upload_size'] += filesize
    debug_info['upload_file_count'] += 1

    # upload summary as well
    local_summary_filename = os.path.join(
        current_dir, f"__{round}_{group_idx}-{pulse_bucket_name}-{dt}-summary__.json")
    with open(local_summary_filename, "wt") as f:
        for summary in summaries:
            line = f"{json.dumps(summary)}\n"
            f.write(line)
    t1 = datetime.utcnow()
    os_upload(save_os_client, local_summary_filename, dest_namespace, dest_bucket,
              f"{summary_path}/{round:03}_{group_idx:07}.json")
    t2 = datetime.utcnow()
    filesize = os.path.getsize(local_summary_filename)
    duration = (t2 - t1).total_seconds()
    debug_info['upload_duration'] += duration
    debug_info['upload_size'] += filesize
    debug_info['upload_file_count'] += 1

    end_time = datetime.utcnow()
    debug_info['total_duration'] = (end_time - begin_time).total_seconds()
    debug_info['begin_time'] = begin_time.strftime("%Y-%m-%d %H:%M:%S")
    debug_info['end_time'] = end_time.strftime("%Y-%m-%d %H:%M:%S")
    debug_info['unhandled_count'] = len(unhandled_object_infos)
    return unhandled_object_infos, debug_info
def action_stage_objects(spark, args, sysops):
    # requires
    # args
    #     action : should be "stage-objects"
    #     stage : "beta" or "prod"
    #     pulse_bucket_name : the PULSE bucket, e.g. "problems"
    #     dt : string, the date for the data, e.g. "2020-06-01"
    #     regions : dict, key is region name, value is ad list, like ['AD_1', 'AD_2', 'AD_3']
    #     watermark : controls how big each group is (in terms of total raw pulse object size)
    #     max_group_time : max time (in seconds) a group is allowed to run
    #     group_count : number of groups (tasks) per round
    print("action_stage_objects: enter")
    stage = args['stage']
    pulse_bucket_name = args['pulse_bucket_name']
    dt = dt_str_to_date(args['dt'])
    regions = args['regions']
    watermark = args['watermark']
    max_group_time = timedelta(seconds=int(args['max_group_time']))
    group_count = args['group_count']

    debug_info = {
        "collect_object_list_duration": 0.0,  # time to collect the list of objects for the day
        "object_count": 0,                    # total object count
        "rounds": [],
    }

    delegation_token = get_delegation_token(spark)

    # get object list
    t1 = datetime.utcnow()
    obj_infos = []
    for region, ads in regions.items():
        region_obj_infos = pl.get_objects_for_day(dt, {
            'bucket': pulse_bucket_name,
            'region': region,
            'ads': ads
        }, delegation_token)
        print(f"{len(region_obj_infos)} objects for {region}")
        obj_infos.extend(region_obj_infos)
    print(f"Overall {len(obj_infos)} objects")
    t2 = datetime.utcnow()
    duration = (t2 - t1).total_seconds()
    debug_info['collect_object_list_duration'] = duration
    debug_info['object_count'] = len(obj_infos)

    # during testing, chop the list to speed things up
    # obj_infos = obj_infos[:1000]

    destination = {
        'region': OHDDATA_REGION,
        'namespace': OHDDATA_NAMESPACE,
        'bucket': OHDDATA_BUCKETS[pulse_bucket_name][stage],
    }
    dest_region = destination['region']
    dest_namespace = destination['namespace']
    dest_bucket = destination['bucket']

    # clean the staging area
    os_client = dfapp_get_os_client(dest_region, delegation_token)
    os_delete_objects(os_client, dest_namespace, dest_bucket, f"__stage__/{dt}/raw/")
    os_delete_objects(os_client, dest_namespace, dest_bucket, f"__stage__/{dt}/summary/")

    current_object_infos = obj_infos
    worker_task = make_task(task_stage_objects, sysops)
    for round in range(0, 10):  # no more than 10 rounds
        unhandled_object_infos, round_debug_info = stage_json_objects_round(
            spark, worker_task, round, dt, destination, pulse_bucket_name,
            current_object_infos, watermark, max_group_time, group_count)
        debug_info['rounds'].append(round_debug_info)
        if len(unhandled_object_infos) == 0:
            break
        current_object_infos = unhandled_object_infos

    if len(unhandled_object_infos) > 0:
        raise Exception("Not all objects are handled!")

    print("action_stage_objects: exit")
    return debug_info
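# A hedged sketch of the `args` dict action_stage_objects expects, assembled from the
# parameter comments and the keys read above; every concrete value here is a
# hypothetical placeholder, not a value used by any real deployment.
_EXAMPLE_STAGE_OBJECTS_ARGS = {
    "action": "stage-objects",
    "stage": "prod",                                   # "beta" or "prod"
    "pulse_bucket_name": "problems",                   # PULSE bucket
    "dt": "2020-06-01",                                # day to process
    "regions": {
        "us-ashburn-1": ['AD_1', 'AD_2', 'AD_3'],      # placeholder region -> AD list
    },
    "watermark": 1024 * 1024 * 1024,                   # placeholder group-size watermark, in bytes
    "max_group_time": 1800,                            # placeholder seconds per group/round
    "group_count": 16,                                 # placeholder number of groups per round
}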