import json
import os
import re
from datetime import datetime

import mmh3
import numpy as np
import pandas as pd

# Task, Workflow, Workload, and TARGET_DIR are assumed to be defined elsewhere
# in the surrounding parse script (not shown here).


def container_to_task(df):
    """Collapses all usage rows of a single container into a one-row Task DataFrame."""
    row = df.iloc[0, :]
    # We convert time from seconds to milliseconds.
    start_time = df["time_stamp"].min() * 1000
    stop_time = df["time_stamp"].max() * 1000
    task_id = mmh3.hash64(row["container_id"])[1]
    workflow_id = mmh3.hash64(row["app_du"])[1]

    task = Task(
        id=task_id,
        type="long running",
        parents=[],
        ts_submit=start_time,
        submission_site=0,
        runtime=(stop_time - start_time),
        resource_amount_requested=row["cpu_request"],
        memory_requested=row["mem_size"],
        workflow_id=workflow_id,
        wait_time=0,
        resource=mmh3.hash64(row["machine_id"])[1])

    return pd.DataFrame([task.get_parquet_dict()])
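
# Hedged usage sketch, not part of the original script: container_to_task expects
# all samples of one container, so a driver would typically group a usage
# DataFrame by container_id and concatenate the per-container results. The helper
# name and the assumption that `usage_df` carries the columns read above
# (container_id, machine_id, time_stamp, app_du, cpu_request, mem_size) are
# illustrative only.
def example_containers_to_tasks(usage_df):
    per_container = (container_to_task(group)
                     for _, group in usage_df.groupby("container_id", sort=False))
    return pd.concat(per_container, ignore_index=True)
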
def parse_lanl_file(lanl_file):
    """Parses the LANL Trinity trace CSV and writes Task, Workflow, and Workload files to TARGET_DIR."""
    task_list = []
    task_by_id = {}

    df = pd.read_csv(lanl_file,
                     parse_dates=["submission_time", "start_date", "end_date"],
                     infer_datetime_format=True)

    task_df = df[df['object_event'] == "JOBEND"]

    earliest_date = df['submission_time'].min()
    latest_date = df["end_date"].max()

    for index, row in task_df.iterrows():
        id = str(
            mmh3.hash64("task:{}".format(str(row["object_id"]).strip()))[0])

        # Task time fields
        submission_time_task = row["submission_time"]
        start_time_task = row["start_date"]
        end_time_task = row["end_date"]

        # Task cpu consumption fields
        num_nodes = int(row["nodes_requested"])
        num_cpus_per_node = row["dedicated_processors_per_task"]

        # Task dependency fields
        extension_string = str(row["resource_manager_extension_string"])

        # Find dependencies
        match = re.search(r'DEPEND=([\w,:.]+);?', extension_string)
        if not match:
            dependencies = set()
        else:
            dependencies = match.group(1)
            dependencies = set(
                str(mmh3.hash64("task:{}".format(str(dep).strip()))[0])
                for dep in dependencies.split("&"))

        task_wait_time = int(
            (start_time_task - submission_time_task).total_seconds() * 1000)
        task_runtime = int(
            (end_time_task - start_time_task).total_seconds() * 1000)

        task = Task(id,
                    "Atomic",
                    start_time_task,
                    -1,
                    task_runtime,
                    num_nodes * num_cpus_per_node,
                    dependencies,
                    -1,
                    task_wait_time,
                    resource_type="core",
                    resource=-1)

        # Convert ts_submit from a datetime to milliseconds since the epoch
        EPOCH = datetime(1970, 1, 1, tzinfo=task.ts_submit.tzinfo)
        task.ts_submit = int((task.ts_submit - EPOCH).total_seconds() * 1000)

        # Set the wallclock limit
        task.nfrs["runtime_limit"] = row["wallclock_limit"]

        task_by_id[id] = task
        task_list.append(task)

    min_ts_submit = min(task.ts_submit for task in task_list)

    # For every task, add itself to the children of its parents
    for task in task_list:
        task.ts_submit -= min_ts_submit  # Make sure the first task in the trace starts at 0
        invalid_parents = set()
        for parent_id in task.parents:
            # Chop off the prefix up to ":" (e.g. jobsuccess:)
            actual_parent_id = parent_id[str(parent_id).find(":") + 1:]
            # If the parent is not found, the task probably failed; we filter those out.
            if actual_parent_id in task_by_id:
                parent = task_by_id[actual_parent_id]
                parent.children.add(task.id)
            else:
                invalid_parents.add(parent_id)

        # Remove invalid parents
        if invalid_parents:
            task.parents -= invalid_parents

    # Find start tasks and assign workflow ids
    workflow_id = 0
    for task in task_list:
        if task.workflow_id == -1:
            root_parents = task.get_root_parents(task_by_id)
            if root_parents:
                # If there are start tasks, propagate from them
                for root_parent_id in root_parents:
                    actual_root_id = root_parent_id[str(root_parent_id).find(":") + 1:]
                    task_by_id[actual_root_id].set_workflow_id_propagating(
                        task_by_id, workflow_id)
            else:
                # Else it's a single job, so just set the property directly
                task.workflow_id = workflow_id
            workflow_id += 1

    # Now that everything has been computed, we write the tasks to parquet files
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])

    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(
        ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(),
                                    "part.0.parquet"),
                       engine="pyarrow")

    workflows = dict()
    # Based on workflow ids, construct the workflow objects
    for task in task_list:
        if task.workflow_id in workflows:
            workflow = workflows[task.workflow_id]
        else:
            workflow = Workflow(task.workflow_id, None, [], "", "Scientific",
                                "Uncategorized", "Uncategorized")
            workflows[task.workflow_id] = workflow

        if not workflow.ts_submit:
            workflow.ts_submit = task.ts_submit
        else:
            workflow.ts_submit = min(workflow.ts_submit, task.ts_submit)

        workflow.tasks.append(task)
        workflow.task_count = len(workflow.tasks)

    for w in workflows.values():
        w.compute_critical_path(strip_colon=True)

    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df = pd.DataFrame(
        [workflow.get_parquet_dict() for workflow in workflows.values()])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(),
                                        "part.0.parquet"),
                           engine="pyarrow")

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain="Engineering",
        start_date=str(earliest_date),
        end_date=str(latest_date),
        authors=[
            "George Amvrosiadis", "Jun Woo Park", "Gregory R. Ganger",
            "Garth A. Gibson", "Elisabeth Baseman", "Nathan DeBardeleben"
        ],
        workload_description=
        "This workload was published by Amvrosiadis et al. as part of their ATC 2018 paper titled \"On the diversity of cluster workloads and its impact on research results\". It is the Trinity trace from the Los Alamos National Laboratory."
    )

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # json cannot serialize numpy int64 values directly; needed on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))