Example #1
def get_objects_for_day(dt, source, delegation_token):
    # dt                            - the date being processed (datetime.date)
    # source                        - dict describing the source
    #     region                    - the region of the pulse data
    #     bucket                    - the bucket name, e.g. "problems"
    #     ads                       - list of ADs in the region, e.g. ['AD_1', 'AD_2', 'AD_3']
    # delegation_token              - the OCI delegation token
    src_region = source['region']
    src_bucket = source['bucket']
    src_ads = source['ads']
    os_client = dfapp_get_os_client(src_region, delegation_token)

    obj_infos = []
    for ad in src_ads:
        prefix = f"{src_region}_{ad}/{src_bucket}/{dt.year}/{dt.month}/{dt.day}/"
        for record in oci.pagination.list_call_get_all_results_generator(
                os_client.list_objects,
                'record',
                PULSE_NAMESPACE,
                src_bucket,
                prefix=prefix,
                fields="name,size",
        ):
            obj_infos.append({
                "name": record.name,
                "size": record.size,
                'region': src_region,
                'ad': ad
            })

    return obj_infos
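
A minimal usage sketch (illustrative values, not part of the original module); it assumes PULSE_NAMESPACE, dfapp_get_os_client and a delegation_token are available, as in the example above:

from datetime import date

source = {
    'region': 'us-ashburn-1',        # illustrative region
    'bucket': 'problems',            # Pulse bucket name
    'ads': ['AD_1', 'AD_2', 'AD_3'],
}
obj_infos = get_objects_for_day(date(2020, 6, 1), source, delegation_token)
print(f"{len(obj_infos)} objects, {sum(o['size'] for o in obj_infos)} bytes total")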
Example #2
def main(spark, args):
    # dummy code to load dataframe
    Student = Row("id", "name")
    df = spark.createDataFrame(
        [Student(1, 'foo'),
         Student(3, 'bar'),
         Student(2, 'tar')])
    df.show()

    # list objects for problems
    client = dfapp_get_os_client(
        spark, "https://objectstorage.us-ashburn-1.oraclecloud.com")

    # print("oci is imported")
    # conf = spark.sparkContext.getConf()
    # token_path = conf.get("spark.hadoop.fs.oci.client.auth.delegationTokenPath")

    # # read in token
    # with open(token_path) as fd:
    #     delegation_token = fd.read()
    #     print("delegation_token = {}".format(delegation_token))

    # signer = oci.auth.signers.InstancePrincipalsDelegationTokenSigner(delegation_token=delegation_token)
    # print("signer created")

    # client = oci.object_storage.ObjectStorageClient(
    #     {}, signer=signer, service_endpoint='https://objectstorage.us-ashburn-1.oraclecloud.com', timeout=60.0
    # )
    # print("object storage client created")

    r = client.list_objects("oci-pulse-prod", "problems")
    for object_summary in r.data.objects:
        print(">> {}".format(object_summary.name))
    print("list object done")
Example #3
def generate_parquet(spark, server_channel, dt, source, destination,
                     partition_count, application_id, application_args):
    # spark             - spark session
    # server_channel    - channel used by DataCatalogClientProxy to reach the data catalog
    # dt                - the date being processed
    # source            - specify the source
    #     bucket        - the Pulse bucket being processed
    # destination
    #     region          - the region for saving the pulse data
    #     bucket          - the bucket for saving the pulse data
    #     namespace       - the namespace for saving the pulse data
    # partition_count   - number of output partitions for the raw parquet
    # application_id    - application id recorded when registering the asset
    # application_args  - application arguments recorded when registering the asset
    source_bucket = source['bucket']
    dest_region = destination['region']
    dest_namespace = destination['namespace']
    dest_bucket = destination['bucket']
    os_client = dfapp_get_os_client(dest_region, get_delegation_token(spark))

    os_delete_objects(os_client, dest_namespace, dest_bucket, f"{dt}/")

    rpl = RawPulseDataLocations(destination)
    # save pulse data
    write_location = rpl.url_from_path(f"{dt}/raw.parquet")
    df = spark.read.json(rpl.url_from_path(f"__stage__/{dt}/raw/*"))
    df.coalesce(partition_count).write.partitionBy("_sys_region").parquet(
        write_location)
    data_time = datetime.combine(dt, datetime.min.time())
    df = spark.read.parquet(write_location)

    # register it to data catalog
    dcc = DataCatalogClientProxy(server_channel)
    loader = Loader(dcc=dcc)

    sample_data = get_dataframe_sample_data(df)
    dsi = loader.register_asset(
        spark,
        f"daily_pulse_{source_bucket}_raw:1.0:1:/{dt}",
        'hwd',
        'parquet',
        write_location,
        df.count(),
        df.schema.jsonValue(),
        sample_data=sample_data,
        data_time=data_time,
        application_id=application_id,
        application_args=json.dumps(application_args),
    )

    # save summary
    df = spark.read.json(rpl.url_from_path(f"__stage__/{dt}/summary/*"))
    # summary file is small, so it is ok to have 1 partition per region
    df.coalesce(1).write.partitionBy("region").parquet(
        rpl.url_from_path(f"{dt}/summary.parquet"))

    os_delete_objects(os_client, dest_namespace, dest_bucket,
                      f"__stage__/{dt}/raw/")
    os_delete_objects(os_client, dest_namespace, dest_bucket,
                      f"__stage__/{dt}/summary/")
Example #4
def get_os_client_ex(spark, region):
    from oci_core import dfapp_get_os_client, get_delegation_token, get_os_client
    if USE_INSTANCE_PRINCIPLE:
        delegation_token = get_delegation_token(spark)
        os_client = dfapp_get_os_client(region, delegation_token)
    else:
        # delete=False keeps the key file on disk so the OCI client can read it after the with-block
        with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as key_f:
            key_f.write(OCI_KEY)
        _oci_config = dict(OCI_CONFIG)
        _oci_config['key_file'] = key_f.name
        os_client = get_os_client(None, config=_oci_config)
    return os_client
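
get_os_client_ex relies on a few module-level settings (and on tempfile being imported); the values below are placeholders showing the expected shape, not real configuration:

import tempfile

USE_INSTANCE_PRINCIPLE = False   # True when an instance-principal delegation token is available
OCI_KEY = "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----"  # placeholder key material
OCI_CONFIG = {
    "user": "ocid1.user.oc1..example",
    "fingerprint": "20:3b:...",              # placeholder fingerprint
    "tenancy": "ocid1.tenancy.oc1..example",
    "region": "us-ashburn-1",
    # "key_file" is filled in at runtime with the temp file holding OCI_KEY
}

# os_client = get_os_client_ex(spark, "us-ashburn-1")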
Example #5
def stage_objects(group_idx, group_creation_time, round, max_group_time,
                  source_region, total_size, dt, obj_infos, delegation_token,
                  destination, pulse_bucket_name):
    # group_idx           - group index; each task has a group index
    # group_creation_time - when the group was created (datetime)
    # round               - the round number (integer)
    # max_group_time      - max time a group is allowed to run (timedelta)
    # source_region       - the Pulse region
    # total_size          - total raw Pulse object size in bytes
    # dt                  - the date being processed
    # obj_infos           - list of dicts, each with the name, size, region and ad of a raw Pulse object
    # delegation_token    - OCI delegation token
    # destination
    #     region          - the region for saving the pulse data
    #     bucket          - the bucket for saving the pulse data
    #     namespace       - the namespace for saving the pulse data
    # pulse_bucket_name   - Pulse bucket name

    current_dir = os.getcwd()

    begin_time = datetime.utcnow()

    debug_info = {
        "download_file_count": 0,          # number of gzipped raw pulse files downloaded
        "download_duration": 0.0,          # time spent downloading gzipped raw pulse files
        "download_size": 0,                # total bytes of gzipped raw pulse files downloaded
        "total_size": total_size,          # total bytes of gzipped raw pulse files (from the list objects API);
                                           # should match download_size
        "unzip_file_count": 0,             # number of times we unzip a gzipped raw pulse file
        "unzip_duration": 0.0,             # time spent unzipping gzipped raw pulse files
        "load_json_count": 0,              # number of JSON documents we try to load from unzipped pulse files
        "load_json_duration": 0.0,         # time spent loading JSON
        "dump_json_line_duration": 0.0,    # time spent dumping JSON objects to output lines
        "write_stage_file_duration": 0.0,  # time spent writing to the staging file
        "gzip_output_duration": 0.0,       # time spent gzipping the output JSON for pulse data
        "upload_duration": 0.0,            # time spent uploading to object storage (raw and summary JSON)
        "upload_size": 0,                  # total bytes uploaded
        "upload_file_count": 0,            # number of files uploaded
        "total_duration": 0.0,             # total duration of this task
        "unhandled_count": 0,              # number of obj_infos that were skipped
    }

    rpl = RawPulseDataLocations(destination)

    stage_path = f"__stage__/{dt}/raw"
    summary_path = f"__stage__/{dt}/summary"

    dest_region = destination['region']
    dest_namespace = destination['namespace']
    dest_bucket = destination['bucket']

    save_os_client = dfapp_get_os_client(dest_region, delegation_token)
    pulse_os_client = dfapp_get_os_client(source_region, delegation_token)

    summaries = []

    local_stage_filename = os.path.join(
        current_dir,
        f"__{round}_{group_idx}-{pulse_bucket_name}-{dt}-stage__.json")
    local_stage_gz_filename = f"{local_stage_filename}.gz"

    if os.path.isfile(local_stage_filename):
        os.remove(local_stage_filename)
    if os.path.isfile(local_stage_gz_filename):
        os.remove(local_stage_gz_filename)

    unhandled_object_infos = []
    with open(local_stage_filename, "a+t") as f:
        for obj_info_idx, obj_info in enumerate(obj_infos):

            now = datetime.utcnow()
            if (now - group_creation_time) >= max_group_time:
                unhandled_object_infos = obj_infos[obj_info_idx:]
                break

            object_name = obj_info['name']
            filename_md5 = hashlib.md5(object_name.encode('utf-8')).hexdigest()
            rows, status = object_to_json(pulse_os_client, pulse_bucket_name,
                                          object_name, debug_info)
            for idx, row in enumerate(rows):
                row['_sys_region'] = obj_info['region']
                row['_sys_filename_md5'] = filename_md5
                row['_sys_idx'] = idx

                t1 = datetime.utcnow()
                line = f"{json.dumps(row)}\n"
                t2 = datetime.utcnow()
                duration = (t2 - t1).total_seconds()
                debug_info['dump_json_line_duration'] += duration

                t1 = datetime.utcnow()
                f.write(line)
                t2 = datetime.utcnow()
                duration = (t2 - t1).total_seconds()
                debug_info['write_stage_file_duration'] += duration

            summary = {
                "count": len(rows),
                "ad": obj_info['ad'],
                "region": obj_info['region'],
                "filename_md5": filename_md5,
                "filename": obj_info['name'],
                "filesize": obj_info['size'],
                "date": str(dt),
                "status": status
            }
            summaries.append(summary)

    t1 = datetime.utcnow()
    subprocess.check_call(['gzip', local_stage_filename])
    t2 = datetime.utcnow()
    duration = (t2 - t1).total_seconds()
    debug_info['gzip_output_duration'] = duration

    filesize = os.path.getsize(local_stage_gz_filename)
    t1 = datetime.utcnow()
    os_upload(save_os_client, local_stage_gz_filename, dest_namespace,
              dest_bucket, f"{stage_path}/{round:03}_{group_idx:07}.json.gz")
    if os.path.isfile(local_stage_filename):
        os.remove(local_stage_filename)
    if os.path.isfile(local_stage_gz_filename):
        os.remove(local_stage_gz_filename)
    t2 = datetime.utcnow()
    duration = (t2 - t1).total_seconds()
    debug_info['upload_duration'] += duration
    debug_info['upload_size'] += filesize
    debug_info['upload_file_count'] += 1

    # upload summary as well
    local_summary_filename = os.path.join(
        current_dir,
        f"__{round}_{group_idx}-{pulse_bucket_name}-{dt}-summary__.json")
    with open(local_summary_filename, "wt") as f:
        for summary in summaries:
            line = f"{json.dumps(summary)}\n"
            f.write(line)
    t1 = datetime.utcnow()
    os_upload(save_os_client, local_summary_filename, dest_namespace,
              dest_bucket, f"{summary_path}/{round:03}_{group_idx:07}.json")
    t2 = datetime.utcnow()
    filesize = os.path.getsize(local_summary_filename)
    duration = (t2 - t1).total_seconds()
    debug_info['upload_duration'] += duration
    debug_info['upload_size'] += filesize
    debug_info['upload_file_count'] += 1

    end_time = datetime.utcnow()
    debug_info['total_duration'] = (end_time - begin_time).total_seconds()
    debug_info['begin_time'] = begin_time.strftime("%Y-%m-%d %H:%M:%S")
    debug_info['end_time'] = end_time.strftime("%Y-%m-%d %H:%M:%S")
    debug_info['unhandled_count'] = len(unhandled_object_infos)
    return unhandled_object_infos, debug_info
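
stage_objects repeats the same t1/t2 timing pattern many times; one way to factor that out (a sketch, not part of the original module) is a small context manager that accumulates elapsed seconds into debug_info:

from contextlib import contextmanager
from datetime import datetime

@contextmanager
def timed(debug_info, key):
    # Add the wall-clock time of the wrapped block to debug_info[key].
    t1 = datetime.utcnow()
    try:
        yield
    finally:
        debug_info[key] = debug_info.get(key, 0.0) + (datetime.utcnow() - t1).total_seconds()

# usage inside stage_objects:
#     with timed(debug_info, 'write_stage_file_duration'):
#         f.write(line)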
Example #6
def action_stage_objects(spark, args, sysops):
    # requires
    # args
    #     action            : should be "stage-objects"
    #     stage             : "beta" or "prod"
    #     pulse_bucket_name : the PULSE bucket, e.g. "problems"
    #     dt                : string, the date for the data, e.g. "2020-06-01"
    #     regions           : dict, key is region name, value is the AD list, e.g. ['AD_1', 'AD_2', 'AD_3']
    #     watermark         : controls how big each group is (in terms of total raw pulse object size)
    #     max_group_time    : max time a group is allowed to run, in seconds
    #     group_count       : number of groups per round
    print("action_stage_objects: enter")

    stage = args['stage']
    pulse_bucket_name = args['pulse_bucket_name']
    dt = dt_str_to_date(args['dt'])
    regions = args['regions']
    watermark = args['watermark']
    max_group_time = timedelta(seconds=int(args['max_group_time']))
    group_count = args['group_count']

    debug_info = {
        "collect_object_list_duration":
        0.0,  # time to collect list of objects for the day
        "object_count": 0,  # total object count
        "rounds": [],
    }

    delegation_token = get_delegation_token(spark)

    # get object list
    t1 = datetime.utcnow()
    obj_infos = []
    for region, ads in regions.items():
        region_obj_infos = pl.get_objects_for_day(dt, {
            'bucket': pulse_bucket_name,
            'region': region,
            'ads': ads
        }, delegation_token)
        print(f"{len(region_obj_infos)} objects for {region}")
        obj_infos.extend(region_obj_infos)
    print(f"Overall {len(obj_infos)} objects")
    t2 = datetime.utcnow()
    duration = (t2 - t1).total_seconds()
    debug_info['collect_object_list_duration'] = duration
    debug_info['object_count'] = len(obj_infos)

    # during testing, truncate the object list to speed things up
    # obj_infos = obj_infos[:1000]

    destination = {
        'region': OHDDATA_REGION,
        'namespace': OHDDATA_NAMESPACE,
        'bucket': OHDDATA_BUCKETS[pulse_bucket_name][stage],
    }
    dest_region = destination['region']
    dest_namespace = destination['namespace']
    dest_bucket = destination['bucket']

    # clean the staging area
    os_client = dfapp_get_os_client(dest_region, delegation_token)
    os_delete_objects(os_client, dest_namespace, dest_bucket,
                      f"__stage__/{dt}/raw/")
    os_delete_objects(os_client, dest_namespace, dest_bucket,
                      f"__stage__/{dt}/summary/")

    current_object_infos = obj_infos
    worker_task = make_task(task_stage_objects, sysops)
    for round in range(0, 10):  # no more than 10 rounds
        unhandled_object_infos, round_debug_info = stage_json_objects_round(
            spark, worker_task, round, dt, destination, pulse_bucket_name,
            current_object_infos, watermark, max_group_time, group_count)
        debug_info['rounds'].append(round_debug_info)

        if len(unhandled_object_infos) == 0:
            break
        current_object_infos = unhandled_object_infos

    if len(unhandled_object_infos) > 0:
        raise Exception("Not all objects were handled!")

    print("action_stage_objects: exit")

    return debug_info
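
An illustrative args payload for action_stage_objects, matching the keys the function reads (values are examples only; the exact watermark and group_count semantics come from stage_json_objects_round, which is defined elsewhere):

args = {
    "action": "stage-objects",
    "stage": "prod",
    "pulse_bucket_name": "problems",
    "dt": "2020-06-01",
    "regions": {"us-ashburn-1": ["AD_1", "AD_2", "AD_3"]},
    "watermark": 1024 * 1024 * 1024,   # target total raw object size per group, in bytes
    "max_group_time": 600,             # seconds a group may run before remaining objects roll over
    "group_count": 16,                 # number of groups per round
}
# debug_info = action_stage_objects(spark, args, sysops)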