import json
import os
import re
from datetime import datetime

import mmh3
import numpy as np
import pandas as pd

# Task, Workflow, Workload, and TARGET_DIR are assumed to be defined elsewhere
# in the surrounding parse script (not shown here).


def container_to_task(df):
    """Collapses all usage rows of a single container into a one-row Task DataFrame."""
    row = df.iloc[0, :]
    # We convert time from seconds to milliseconds.
    start_time = df["time_stamp"].min() * 1000
    stop_time = df["time_stamp"].max() * 1000
    task_id = mmh3.hash64(row["container_id"])[1]
    workflow_id = mmh3.hash64(row["app_du"])[1]

    task = Task(
        id=task_id,
        type="long running",
        parents=[],
        ts_submit=start_time,
        submission_site=0,
        runtime=(stop_time - start_time),
        resource_amount_requested=row["cpu_request"],
        memory_requested=row["mem_size"],
        workflow_id=workflow_id,
        wait_time=0,
        resource=mmh3.hash64(row["machine_id"])[1])

    return pd.DataFrame([task.get_parquet_dict()])
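
# Hedged usage sketch, not part of the original script: container_to_task expects
# all samples of one container, so a driver would typically group a usage
# DataFrame by container_id and concatenate the per-container results. The helper
# name and the assumption that `usage_df` carries the columns read above
# (container_id, machine_id, time_stamp, app_du, cpu_request, mem_size) are
# illustrative only.
def example_containers_to_tasks(usage_df):
    per_container = (container_to_task(group)
                     for _, group in usage_df.groupby("container_id", sort=False))
    return pd.concat(per_container, ignore_index=True)
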
def parse_lanl_file(lanl_file):
    """Parses the LANL Trinity trace CSV and writes Task, Workflow, and Workload files to TARGET_DIR."""
    task_list = []
    task_by_id = {}

    df = pd.read_csv(lanl_file,
                     parse_dates=["submission_time", "start_date", "end_date"],
                     infer_datetime_format=True)

    task_df = df[df['object_event'] == "JOBEND"]

    earliest_date = df['submission_time'].min()
    latest_date = df["end_date"].max()

    for index, row in task_df.iterrows():
        id = str(
            mmh3.hash64("task:{}".format(str(row["object_id"]).strip()))[0])

        # Task time fields
        submission_time_task = row["submission_time"]
        start_time_task = row["start_date"]
        end_time_task = row["end_date"]

        # Task cpu consumption fields
        num_nodes = int(row["nodes_requested"])
        num_cpus_per_node = row["dedicated_processors_per_task"]

        # Task dependency fields
        extension_string = str(row["resource_manager_extension_string"])

        # Find dependencies
        match = re.search(r'DEPEND=([\w,:.]+);?', extension_string)
        if not match:
            dependencies = set()
        else:
            dependencies = match.group(1)
            dependencies = set(
                str(mmh3.hash64("task:{}".format(str(dep).strip()))[0])
                for dep in dependencies.split("&"))

        task_wait_time = int(
            (start_time_task - submission_time_task).total_seconds() * 1000)
        task_runtime = int(
            (end_time_task - start_time_task).total_seconds() * 1000)

        task = Task(id,
                    "Atomic",
                    start_time_task,
                    -1,
                    task_runtime,
                    num_nodes * num_cpus_per_node,
                    dependencies,
                    -1,
                    task_wait_time,
                    resource_type="core",
                    resource=-1)

        # Convert ts_submit from a datetime to milliseconds since the epoch
        EPOCH = datetime(1970, 1, 1, tzinfo=task.ts_submit.tzinfo)
        task.ts_submit = int((task.ts_submit - EPOCH).total_seconds() * 1000)

        # Set the wallclock limit
        task.nfrs["runtime_limit"] = row["wallclock_limit"]

        task_by_id[id] = task
        task_list.append(task)

    min_ts_submit = min(task.ts_submit for task in task_list)

    # For every task, add itself to the children of its parents
    for task in task_list:
        task.ts_submit -= min_ts_submit  # Make sure the first task in the trace starts at 0
        invalid_parents = set()
        for parent_id in task.parents:
            # Chop off the prefix up to ":" (e.g. jobsuccess:)
            actual_parent_id = parent_id[str(parent_id).find(":") + 1:]
            # If the parent is not found, the task probably failed; we filter those out.
            if actual_parent_id in task_by_id:
                parent = task_by_id[actual_parent_id]
                parent.children.add(task.id)
            else:
                invalid_parents.add(parent_id)

        # Remove invalid parents
        if invalid_parents:
            task.parents -= invalid_parents

    # Find start tasks and assign workflow ids
    workflow_id = 0
    for task in task_list:
        if task.workflow_id == -1:
            root_parents = task.get_root_parents(task_by_id)
            if root_parents:
                # If there are start tasks, propagate from them
                for root_parent_id in root_parents:
                    actual_root_id = root_parent_id[str(root_parent_id).find(":") + 1:]
                    task_by_id[actual_root_id].set_workflow_id_propagating(
                        task_by_id, workflow_id)
            else:
                # Else it's a single job, so just set the property directly
                task.workflow_id = workflow_id
            workflow_id += 1

    # Now that everything has been computed, we write the tasks to parquet files
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])

    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(
        ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(),
                                    "part.0.parquet"),
                       engine="pyarrow")

    workflows = dict()
    # Based on workflow ids, construct the workflow objects
    for task in task_list:
        if task.workflow_id in workflows:
            workflow = workflows[task.workflow_id]
        else:
            workflow = Workflow(task.workflow_id, None, [], "", "Scientific",
                                "Uncategorized", "Uncategorized")
            workflows[task.workflow_id] = workflow

        if not workflow.ts_submit:
            workflow.ts_submit = task.ts_submit
        else:
            workflow.ts_submit = min(workflow.ts_submit, task.ts_submit)

        workflow.tasks.append(task)
        workflow.task_count = len(workflow.tasks)

    for w in workflows.values():
        w.compute_critical_path(strip_colon=True)

    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df = pd.DataFrame(
        [workflow.get_parquet_dict() for workflow in workflows.values()])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(),
                                        "part.0.parquet"),
                           engine="pyarrow")

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain="Engineering",
        start_date=str(earliest_date),
        end_date=str(latest_date),
        authors=[
            "George Amvrosiadis", "Jun Woo Park", "Gregory R. Ganger",
            "Garth A. Gibson", "Elisabeth Baseman", "Nathan DeBardeleben"
        ],
        workload_description=
        "This workload was published by Amvrosiadis et al. as part of their ATC 2018 paper titled \"On the diversity of cluster workloads and its impact on research results\". It is the Trinity trace from the Los Alamos National Laboratory."
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # json cannot serialize numpy int64 values directly; needed on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))