def clean_tasks_of_workflow(df):
    """Parse the rows (task instances) of one Alibaba workflow into ``Task`` objects.

    Instances with a missing start or end time, or with no usable CPU request
    (neither ``plan_cpu`` nor ``cpu_avg``), are dropped and their wave is
    remembered so dangling DAG edges can be pruned. Waves with more than 10
    instances get synthetic "dummy" parent/child barrier tasks so the DAG does
    not contain a full bipartite edge set between large waves.

    :param df: pandas DataFrame holding the rows of exactly one job (workflow);
               assumed columns: job_name, task_name, instance_name,
               instance_num, plan_cpu, plan_mem, cpu_avg, machine_id,
               start_time, end_time (seconds) -- confirm against caller.
    :return: pandas DataFrame with one row per ``Task.get_parquet_dict()``; if
             no instance was valid, an empty DataFrame that still carries the
             Task columns (otherwise Spark goes boom downstream).
    """
    tasks = dict()
    # Maps a raw (per-wave) task name to its declared number of instances.
    raw_id_to_instances = dict()

    job_name = df.loc[0, "job_name"]
    workflow_id = mmh3.hash64(job_name)[1]

    # Raw names of waves whose instances had to be discarded; parent edges
    # pointing at them are pruned in the second pass below.
    invalid_task_raw_ids = set()

    for row in df.itertuples(index=False):
        if None in row:
            # Debug aid: surface rows with missing fields in the trace.
            print(row, flush=True)
        task_name = row.task_name
        instance_name = str(row.instance_name)
        memory_requested = row.plan_mem
        resources_requested = row.plan_cpu
        resource_id = row.machine_id

        splits = task_name.split("_")

        if splits[0] == "task":
            # "task_<id>" entries are independent bag-of-tasks instances
            # without DAG edges.
            cleaned_task_name = splits[1]
            task_type = "bag"
            raw_parents = []
        else:
            # "<TypeChar><id>_<parent>_<parent>..." encodes the task type
            # (first character) and the numeric ids of its DAG parents.
            cleaned_task_name = splits[0][1:]
            task_type = str(splits[0][0])
            raw_parents = [x for x in splits[1:] if x.isdigit()]

        if resource_id is None:
            resource_id = -1
        else:
            resource_id = mmh3.hash64(row.machine_id)[1]

        # Without both timestamps we cannot compute a runtime; drop the
        # instance and remember the wave for edge pruning.
        if row.end_time is None or math.isnan(row.end_time):
            invalid_task_raw_ids.add(cleaned_task_name)
            continue

        if row.start_time is None or math.isnan(row.start_time):
            invalid_task_raw_ids.add(cleaned_task_name)
            continue

        if memory_requested is None or math.isnan(memory_requested):
            memory_requested = -1

        if resources_requested is None or math.isnan(resources_requested):
            # Fall back to the measured average CPU; if that is missing
            # too, the instance is unusable.
            avg_cpu = row.cpu_avg
            if avg_cpu is None or math.isnan(avg_cpu):
                invalid_task_raw_ids.add(cleaned_task_name)
                continue
            else:
                resources_requested = avg_cpu

        this_task_id = mmh3.hash64(job_name + "@" + cleaned_task_name +
                                   "@" + instance_name)[1]

        if cleaned_task_name not in raw_id_to_instances:
            raw_id_to_instances[cleaned_task_name] = row.instance_num

        if row.instance_num > 10:
            # Large wave: create dummy parent and child barrier tasks so
            # every instance hangs off a single synthetic node.
            raw_parent_id = cleaned_task_name + "_p"
            parent_task_id = mmh3.hash64(job_name + "@" + raw_parent_id +
                                         "@" + "0")[1]
            if parent_task_id not in tasks:
                tasks[parent_task_id] = Task(
                    id=parent_task_id,
                    type="dummy",
                    submission_site=0,
                    runtime=0,
                    ts_submit=row.start_time * 1000,
                    # We convert time from seconds to milliseconds.
                    resource_amount_requested=1,
                    parents=raw_parents,
                    workflow_id=workflow_id,
                    wait_time=0,
                    resource_type='core',
                    resource=-1,
                    memory_requested=-1)
                raw_id_to_instances[raw_parent_id] = 1

            raw_child_id = cleaned_task_name + "_c"
            child_task_id = mmh3.hash64(job_name + "@" + raw_child_id +
                                        "@" + "0")[1]
            if child_task_id not in tasks:
                tasks[child_task_id] = Task(
                    id=child_task_id,
                    type="dummy",
                    submission_site=0,
                    runtime=0,
                    ts_submit=row.start_time * 1000,
                    # We convert time from seconds to milliseconds.
                    resource_amount_requested=1,
                    parents=[cleaned_task_name],
                    workflow_id=workflow_id,
                    wait_time=0,
                    resource_type='core',
                    resource=-1,
                    memory_requested=-1,
                    params="child")
                raw_id_to_instances[raw_child_id] = 1

            # All real instances of this wave now descend from the dummy
            # parent instead of the raw parent list.
            raw_parents = [raw_parent_id]

        this_task = Task(
            id=this_task_id,
            type=task_type,
            submission_site=0,
            runtime=(row.end_time - row.start_time) * 1000,
            ts_submit=row.start_time *
            1000,  # We convert time from seconds to milliseconds.
            resource_amount_requested=resources_requested,
            parents=raw_parents,
            workflow_id=workflow_id,
            params=task_name + " $ " + instance_name + " $ " +
            str(row.instance_num) + " $ " + job_name,
            wait_time=0,
            resource_type='core',
            resource=resource_id,
            memory_requested=memory_requested)

        tasks[this_task_id] = this_task

    # Second pass: resolve raw (per-wave) parent names into concrete
    # per-instance task ids and wire up the reverse (children) edges.
    for task_id, task in tasks.items():
        task.parents = [
            p for p in task.parents if p not in invalid_task_raw_ids
        ]
        parents = []
        for raw_parent_id in task.parents:
            # If the previous wave has a dummy child and this task is not that
            # child, refer to the child instead of the wave.
            # BUGFIX: was `task.params is not "child"` — an identity
            # comparison against a string literal that only worked through
            # CPython interning (and raises SyntaxWarning on Python 3.8+).
            if raw_parent_id + "_c" in raw_id_to_instances and task.params != "child":
                raw_parent_id = raw_parent_id + "_c"

            # We might hit an edge case where a parent was not recorded by the
            # system of Alibaba (e.g. bug or the tracing stopped).
            if raw_parent_id not in raw_id_to_instances:
                continue

            parent_instances = raw_id_to_instances[raw_parent_id]

            proper_parent_ids = []
            for x in range(parent_instances):
                # Alibaba tasks specify instance_nums, however these tasks may
                # not necessarily be in the data, so we need to check if they
                # are actually encountered.
                candidate_id = mmh3.hash64(job_name + "@" + raw_parent_id +
                                           "@" + str(x))[1]
                if candidate_id in tasks:
                    proper_parent_ids.append(candidate_id)

            parents.extend(proper_parent_ids)
            for proper_id in proper_parent_ids:
                tasks[proper_id].children.add(task_id)

        task.parents = parents

    parquet_dicts = [task.get_parquet_dict() for task in tasks.values()]
    if len(tasks) > 0:
        ret = pd.DataFrame(parquet_dicts)
    else:
        # If no task was valid, return an empty DF with the columns set.
        # Otherwise Spark goes boom.
        ret = pd.DataFrame(columns=Task.get_parquet_meta_dict().keys())
    return ret
 def validate_task_fields(self, task_pdf):
     """Validate every expected Task parquet column present in *task_pdf*.

     :param task_pdf: pandas DataFrame of task rows — presumably the output
                      of the task-cleaning step; confirm against caller.
     """
     # Iterate keys directly: the column types were fetched but never used
     # (the original looped over .items() and ignored the values).
     for field_name in Task.get_parquet_meta_dict():
         self.validate_field(task_pdf, field_name)