import math

import mmh3
import pandas as pd

from objects.task import Task  # Project-local Task model (import path assumed).


def clean_tasks_of_workflow(df):
    tasks = dict()
    raw_id_to_instances = dict()

    job_name = df.loc[0, "job_name"]
    workflow_id = mmh3.hash64(job_name)[1]

    invalid_task_raw_ids = set()

    # Group by task name:
    # - count the number of instances
    # - compare with row.instance_num

    # Check to inspect if the data is noisy:
    # def check(pdf):
    #     a = pdf["instance_name"].nunique()
    #     b = pdf["instance_name"].astype(np.int64).min()
    #     c = pdf["instance_name"].astype(np.int64).max()
    #     d = pdf["instance_num"].min()
    #     e = pdf["instance_num"].max()
    #     f = pdf["instance_name"].count()
    #     if d != e or b < 0 or c >= e or a != d or a != f:
    #         print("Noisy data! {}, {}, {}, {}, {}, {}".format(a, b, c, d, e, f))
    #
    # df.groupby("task_name").apply(check)

    for row in df.itertuples(index=False):
        if None in row:  # Surface rows with missing fields while parsing.
            print(row, flush=True)

        task_name = row.task_name
        instance_name = str(row.instance_name)
        memory_requested = row.plan_mem
        resources_requested = row.plan_cpu
        resource_id = row.machine_id

        # Task names encode the DAG: e.g. "M2_1" is task 2 (type "M") with raw
        # task 1 as its parent, while plain "task_..." names carry no structure.
        splits = task_name.split("_")
        if splits[0] == "task":
            cleaned_task_name = splits[1]
            task_type = "bag"
            raw_parents = []
        else:
            cleaned_task_name = splits[0][1:]
            task_type = str(splits[0][0])
            raw_parents = [x for x in splits[1:] if x.isdigit()]

        if resource_id is None:
            resource_id = -1
        else:
            resource_id = mmh3.hash64(row.machine_id)[1]

        # Tasks without a valid start or end time cannot be used; mark them
        # invalid so references to them are dropped in the linking phase below.
        if row.end_time is None or math.isnan(row.end_time):
            invalid_task_raw_ids.add(cleaned_task_name)
            continue

        if row.start_time is None or math.isnan(row.start_time):
            invalid_task_raw_ids.add(cleaned_task_name)
            continue

        if memory_requested is None or math.isnan(memory_requested):
            memory_requested = -1

        # Fall back to the observed average CPU usage if no CPU plan was recorded.
        if resources_requested is None or math.isnan(resources_requested):
            avg_cpu = row.cpu_avg
            if avg_cpu is None or math.isnan(avg_cpu):
                invalid_task_raw_ids.add(cleaned_task_name)
                continue
            else:
                resources_requested = avg_cpu

        # Task IDs are deterministic hashes of (job, task, instance), so parent
        # IDs can be recomputed later without keeping a separate lookup table.
        this_task_id = mmh3.hash64(job_name + "@" + cleaned_task_name + "@" + instance_name)[1]

        if cleaned_task_name not in raw_id_to_instances:
            raw_id_to_instances[cleaned_task_name] = row.instance_num

        if row.instance_num > 10:
            # Large waves would explode into all-to-all dependencies, so we
            # fence the wave with dummy parent and child tasks.
            raw_parent_id = cleaned_task_name + "_p"
            parent_task_id = mmh3.hash64(job_name + "@" + raw_parent_id + "@" + "0")[1]
            if parent_task_id not in tasks:
                tasks[parent_task_id] = Task(
                    id=parent_task_id,
                    type="dummy",
                    submission_site=0,
                    runtime=0,
                    ts_submit=row.start_time * 1000,  # We convert time from seconds to milliseconds.
                    resource_amount_requested=1,
                    parents=raw_parents,
                    workflow_id=workflow_id,
                    wait_time=0,
                    resource_type='core',
                    resource=-1,
                    memory_requested=-1)
                raw_id_to_instances[raw_parent_id] = 1

            raw_child_id = cleaned_task_name + "_c"
            child_task_id = mmh3.hash64(job_name + "@" + raw_child_id + "@" + "0")[1]
            if child_task_id not in tasks:
                tasks[child_task_id] = Task(
                    id=child_task_id,
                    type="dummy",
                    submission_site=0,
                    runtime=0,
                    ts_submit=row.start_time * 1000,  # We convert time from seconds to milliseconds.
                    resource_amount_requested=1,
                    parents=[cleaned_task_name],
                    workflow_id=workflow_id,
                    wait_time=0,
                    resource_type='core',
                    resource=-1,
                    memory_requested=-1,
                    params="child")
                raw_id_to_instances[raw_child_id] = 1

            raw_parents = [raw_parent_id]

        this_task = Task(
            id=this_task_id,
            type=task_type,
            submission_site=0,
            runtime=(row.end_time - row.start_time) * 1000,
            ts_submit=row.start_time * 1000,  # We convert time from seconds to milliseconds.
            resource_amount_requested=resources_requested,
            parents=raw_parents,
            workflow_id=workflow_id,
            params=task_name + " $ " + instance_name + " $ " + str(row.instance_num) + " $ " + job_name,
            wait_time=0,
            resource_type='core',
            resource=resource_id,
            memory_requested=memory_requested)

        tasks[this_task_id] = this_task

    # Linking phase: resolve raw (name-based) parent references to task IDs.
    for task_id, task in tasks.items():
        task.parents = [p for p in task.parents if p not in invalid_task_raw_ids]

        parents = []
        for raw_parent_id in task.parents:
            # If the previous wave has a dummy child and this task is not that
            # child, refer to the child instead of the wave.
            if raw_parent_id + "_c" in raw_id_to_instances and task.params != "child":
                raw_parent_id = raw_parent_id + "_c"

            # We might hit an edge case where a parent was not recorded by
            # Alibaba's system (e.g. a bug, or the tracing stopped).
            if raw_parent_id not in raw_id_to_instances:
                continue

            parent_instances = raw_id_to_instances[raw_parent_id]

            proper_parent_ids = []
            for x in range(parent_instances):
                # Alibaba tasks specify instance_nums, but these instances may
                # not necessarily appear in the data, so check that each one
                # was actually encountered.
                parent_hash = mmh3.hash64(job_name + "@" + raw_parent_id + "@" + str(x))[1]
                if parent_hash in tasks:
                    proper_parent_ids.append(parent_hash)

            parents.extend(proper_parent_ids)
            for proper_id in proper_parent_ids:
                tasks[proper_id].children.add(task_id)

        # task.params = None
        task.parents = parents

    parquet_dicts = [task.get_parquet_dict() for task in tasks.values()]
    if len(tasks) > 0:
        ret = pd.DataFrame(parquet_dicts)
    else:
        # If no task was valid, return an empty DF with the columns set.
        # Otherwise Spark goes boom.
        ret = pd.DataFrame(columns=Task.get_parquet_meta_dict().keys())
    return ret
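
# Usage sketch (an assumption, not part of the original module): the cleaner is
# written to run once per workflow, i.e. per job_name group, and it reads
# df.loc[0, "job_name"], so each group needs a zero-based index. Under
# PySpark 3.x this could be wired up with applyInPandas; `spark_df` and
# `task_schema` are hypothetical names, with the schema presumably derived
# from Task.get_parquet_meta_dict().
def _clean_all_workflows(spark_df, task_schema):
    """Hypothetical helper: apply clean_tasks_of_workflow to each workflow."""
    return spark_df.groupBy("job_name").applyInPandas(
        # Reset the index so df.loc[0, "job_name"] inside the cleaner is valid.
        lambda pdf: clean_tasks_of_workflow(pdf.reset_index(drop=True)),
        schema=task_schema)
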
def validate_task_fields(self, task_pdf):
    # Check every field that ends up in the task parquet output.
    for field_name in Task.get_parquet_meta_dict():
        self.validate_field(task_pdf, field_name)
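
# A hedged sketch of the per-field check this method relies on (hypothetical;
# the real validate_field is defined elsewhere on this class). It assumes only
# that task_pdf is a pandas DataFrame whose columns match the parquet fields:
#
# def validate_field(self, task_pdf, field_name):
#     if field_name not in task_pdf.columns:
#         raise ValueError("Missing expected task field: " + field_name)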