Example No. 1
    def generateMap(self, size, resources, spots, repartition, totalRes,
                    delta):
        totalSize = size[0] * size[1]
        cases = list(range(0, totalSize))
        # Reserve the HQ and transmitter tiles. A cell (x, y) maps to the
        # linear index x * size[1] + y, the inverse of the decoding below.
        hqX, hqY = settings.DEFAULT_HQ_POS
        cases.remove(hqX * size[1] + hqY)
        hqX, hqY = settings.DEFAULT_TRANSMITTER_POS
        cases.remove(hqX * size[1] + hqY)
        numpy.random.seed(self._seed)
        resList = []
        for i, res in enumerate(resources):
            # Compute the number of spots for this resource type
            spotNumber = self.getRandomDelta(totalSize * spots[i], delta)
            amountBySpot = self.getRandomDelta(
                totalRes * repartition[i],
                delta) / spotNumber  # uniform distribution over the spots
            for j in range(0, spotNumber):
                # Draw a free cell and decode the linear index back to (x, y)
                position = int(numpy.random.choice(cases))
                cases.remove(position)
                resList.append(
                    Resource(
                        (position // size[1] + settings.BORDER_TILES_NUM,
                         position % size[1] + settings.BORDER_TILES_NUM),
                        (1, 1), res, amountBySpot))
        return resList
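
As a quick illustration of the index arithmetic generateMap relies on, here is a minimal self-contained sketch; the grid size and cell coordinates are made-up values and do not depend on the settings module above:

# Round-trip between an (x, y) grid cell and the linear index used above.
size = (4, 6)          # hypothetical grid: 4 rows, 6 columns
x, y = 2, 5            # hypothetical cell to reserve
index = x * size[1] + y                        # encode, as when reserving the HQ tile
decoded = (index // size[1], index % size[1])  # decode, as inside the spot loop
assert decoded == (x, y)
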
Example No. 2
def parse_resources(c):
    resources = {}
    c.execute("SELECT * FROM host")
    for host_id, wf_id, site, hostname, ip, uname, total_memory in c.fetchall():
        details = {}
        details["ip"] = ip
        details["hostname"] = hostname
        resources[host_id] = Resource(host_id, site, -1, -1, total_memory, -1, -1, uname, details)
    return resources
Example No. 3
def machine_meta_to_resources(row):
    # Derive a stable 64-bit id from the machine_id and wrap the machine
    # metadata in a Resource.
    resource = Resource(
        id=mmh3.hash64(row["machine_id"])[1],
        type="cpu",
        num_resources=float(row["cpu_num"]),
        memory=row["mem_size"],
    )
    resource_dict = resource.get_json_dict()
    del resource_dict["events"]  # drop the events field before building the Row
    return SparkRow(**resource_dict)
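
Several of these examples turn string identifiers into numeric ids with mmh3.hash64, which returns a pair of signed 64-bit integers; a small sketch of that pattern, with an invented machine id string:

import mmh3

machine_id = "machine_1337"          # hypothetical identifier string
h1, h2 = mmh3.hash64(machine_id)     # two signed 64-bit halves of the hash
resource_id = h2                     # the function above keeps index [1]
print(resource_id)
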
Example No. 4
    def __init__(self, **attributes):
        # nb_machine, nb_jobs and problem are expected among the keyword
        # attributes and are set generically here.
        for attr_name, attr_value in attributes.items():
            setattr(self, attr_name, attr_value)
        self.resource_list = []
        self.jobs_list = []

        # Create resources, one per machine
        for i in range(self.nb_machine):
            self.resource_list.append(Resource(i))
        # Create jobs
        for i in range(self.nb_jobs):
            self.jobs_list.append(Job(i, self.problem[i], self.resource_list))
Example No. 5
    def __init__(self, nom):
        self.data, self.optimum = loader(name=nom)
        self.nb_machine = self.data['nb_machine']
        self.nb_jobs = self.data['nb_jobs']
        self.problem = self.data['problem']
        self.nom = nom
        self.resource_list = []
        self.jobs_list = []
        self.makeSpan = -1
        self.criticalPath = []
        self.state = "Not Solved"
        # Create Resources
        for i in range(self.nb_machine):
            self.resource_list.append(Resource(i))
        # Create jobs
        for i in range(self.nb_jobs):
            self.jobs_list.append(Job(i, self.problem[i], self.resource_list, self))
Example No. 6
def parse_and_return_task_dataframe(file_path):
    global TARGET_DIR
    with open(file_path) as trace:
        json_data = json.load(trace)

        workflow = json_data['workflow']
        tasks = workflow['jobs']
        machines = workflow['machines']
        date = json_data['createdAt']

        # Convert the ts_submit to seconds instead of a datetime string
        task_date = dateparser.parse(date)
        EPOCH = datetime(1970, 1, 1, tzinfo=task_date.tzinfo)
        ts_submit = int((task_date - EPOCH).total_seconds() * 1000)

        resource_by_id = dict()

        for machine in machines:
            machine_id = mmh3.hash64("machine:{0}".format(machine['machine_code'].strip()))[0]
            machine = machine["machine"]
            num_cpus = machine['cpu']['count']
            details = {
                "cpu_vendor": machine['cpu']['vendor'],
                "architecture": machine['architecture']
            }
            memory_in_gb = int(machine['memory']) / float(1024 * 1024)
            res = Resource(machine_id, "cluster_node", num_cpus, machine['release'], memory_in_gb, -1, -1,
                           machine['system'], details)
            resource_by_id[machine_id] = res

        task_list = []
        task_state_list = []
        inputs_per_taskid = dict()
        outputs_per_taskid = dict()
        outputs_matched = dict()

        task_per_taskid = dict()

        input_file_data_per_task_id = dict()
        output_file_data_per_task_id = dict()
        for task in tasks:
            task_id = mmh3.hash64("task:{}".format(str(task['name']).strip()))[0]
            print(task_id)
            task_files = task['files'] if 'files' in task else []
            task_type = task['type']
            task_cores = task['cores'] if 'cores' in task else 1
            task_memory = task['memory'] if 'memory' in task else -1
            task_runtime = task['runtime'] * 1000 if 'runtime' in task else -1
            task_dependencies = [mmh3.hash64("task:{}".format(str(p).strip()))[0] for p in
                                 task['parents']]
            task_parameters = {"arguments": task['arguments']} if 'arguments' in task else {}
            task_machine = mmh3.hash64("machine:{0}".format(task['machine'].strip()))[0] if 'machine' in task else None
            task_resource = resource_by_id[task_machine].id if 'machine' in task else -1
            # Convert energy from kWh to Wh
            task_total_energy_consumption = float(task['energy']) * 1000 if 'energy' in task else -1

            t = Task(task_id, task_type, ts_submit, -1, task_runtime, task_cores, task_dependencies, 0, -1,
                     params=task_parameters, resource=task_resource, energy_consumption=task_total_energy_consumption,
                     resource_type="core")

            task_per_taskid[task_id] = t
            task_list.append(t)

            # Parse the data transfers
            for file_item in task_files:
                # Not all traces were fully converted to trace format 0.2 despite being in the 0.2
                # folders, so fall back gracefully when the file name or size fields are missing.
                file_name = file_item['name'] if 'name' in file_item else file_item['fileId']
                file_size = file_item['size'] if 'size' in file_item else -1

                # Store the incoming and outgoing data to this task in separate dicts
                if file_item['link'] == "input":
                    if task_id not in inputs_per_taskid:
                        inputs_per_taskid[task_id] = set()
                        input_file_data_per_task_id[task_id] = dict()
                    inputs_per_taskid[task_id].add(file_name)

                    try:
                        input_file_data_per_task_id[task_id][file_name] = file_size
                    except Exception:
                        print(file_item)
                        exit(-1)

                elif file_item['link'] == "output":
                    if task_id not in outputs_per_taskid:
                        outputs_per_taskid[task_id] = set()
                        outputs_matched[task_id] = dict()
                        output_file_data_per_task_id[task_id] = dict()

                    outputs_per_taskid[task_id].add(file_name)
                    outputs_matched[task_id][file_name] = False
                    output_file_data_per_task_id[task_id][file_name] = file_size

            # Create a task state spanning the entire task duration
            task_state = TaskState(ts_submit, ts_submit + task_runtime, 0, task_id, -1,
                                   canonical_memory_usage=task_memory)
            task_state_list.append(task_state)

        # Make sure the earliest task starts at 0.
        min_ts_submit = min(task.ts_submit for task in task_list)
        for task in task_list:
            # Update the time
            task.ts_submit -= min_ts_submit
            for parent in task.parents:  # Also since we have all parent info, set them in the same loop
                task_per_taskid[parent].children.add(task.id)

        # Offset task states too
        for taskstate in task_state_list:
            taskstate.ts_start -= min_ts_submit
            taskstate.ts_end -= min_ts_submit

        data_transfer_id = 0
        # Since tasks can output files with the same name as other tasks, we must loop over a task's parents
        # and match the output names against input names.
        for task in task_list:  # For every task we have
            if task.id not in inputs_per_taskid: continue
            inputs = inputs_per_taskid[task.id]
            # We loop over the parents (no need to check children, they will come later)
            for dep in task.parents:
                outputs = outputs_per_taskid[dep] if dep in outputs_per_taskid else set()
                overlap = inputs.intersection(outputs)  # Check for overlap
                if len(overlap) > 0:  # We have input-output pairs, loop to construct datatransfers
                    for file_name in overlap:
                        # Get the size and construct a datatransfer object.
                        data_size = output_file_data_per_task_id[dep][file_name]
                        datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, dep, task.id,
                                                    data_size)
                        # Assign it to the tasks
                        task_per_taskid[dep].datatransfers.append(datatransfer)
                        task.datatransfers.append(datatransfer)
                        outputs_matched[dep][file_name] = True

                        # Remove the file from the input as it's covered. Do NOT remove it from output,
                        # the same output file may be used by another task (fan-out structure).
                        inputs.remove(file_name)
                        data_transfer_id += 1

            # Loop over the remaining input files. Since we do not have a source task for them, we
            # assume they are already present on the filesystem.
            for file_name in inputs:
                data_size = input_file_data_per_task_id[task.id][file_name]
                datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                task.datatransfers.append(datatransfer)
                data_transfer_id += 1

        # Loop over the outputs and create a datatransfer for those that are not matched yet.
        # These are likely final result files, which have no destination task.
        for task_id in outputs_matched.keys():
            for file_name in outputs_matched[task_id].keys():
                if not outputs_matched[task_id][file_name]:
                    task = task_per_taskid[task_id]
                    data_size = output_file_data_per_task_id[task_id][file_name]
                    datatransfer = Datatransfer(data_transfer_id, "local", -1, -1, -1, task.id, data_size)
                    task.datatransfers.append(datatransfer)
                    data_transfer_id += 1

        filename_for_this_partition = "part.0.parquet"

        # Write all tasks to parquet
        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
        task_df = pd.DataFrame([task.get_parquet_dict() for task in task_list])
        task_df.to_parquet(os.path.join(TARGET_DIR, Task.output_path(), filename_for_this_partition), engine="pyarrow")

        # Write all task states to parquet
        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True)
        task_state_df = pd.DataFrame([task_state.get_parquet_dict() for task_state in task_state_list])
        task_state_df.to_parquet(os.path.join(TARGET_DIR, TaskState.output_path(), filename_for_this_partition),
                                 engine="pyarrow")

        # Write all data transfers to parquet
        if any(len(task.datatransfers) for task in task_list):
            os.makedirs(os.path.join(TARGET_DIR, Datatransfer.output_path()), exist_ok=True)
            datatransfer_df = pd.DataFrame(
                [datatransfer.get_parquet_dict() for task_item in task_list for datatransfer in
                 task_item.datatransfers])

            datatransfer_df.to_parquet(
                os.path.join(TARGET_DIR, Datatransfer.output_path(), filename_for_this_partition),
                engine="pyarrow")

        # Write the workflows to parquet
        wf_agnostic_df = compute_characteristics(task_df)
        workflow_ts_submit = task_df["ts_submit"].min()

        # Determine the application name and field
        application_names = {
            "epigenomics": ("Epigenomics", "Bioinformatics"),
            "montage": ("Montage", "Astronomy"),
            "soykb": ("SoyKB", "Bioinformatics"),
        }

        application_name = ""
        application_field = ""
        for key in application_names.keys():
            if key in file_path:
                application_name = application_names[key][0]
                application_field = application_names[key][1]

        workflow = Workflow(0, workflow_ts_submit, task_list, "Pegasus", "Scientific", application_name,
                            application_field)
        workflow.compute_critical_path()

        wf_df = pd.DataFrame([workflow.get_parquet_dict()])

        return wf_df
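
The data-transfer matching above reduces to intersecting a task's input file names with each parent's output file names; a standalone sketch of that step on plain sets and dicts, with illustrative file names and task ids only:

# One child task with two inputs; one parent that produced two outputs.
outputs_per_taskid = {"parent_task": {"a.dat", "b.dat"}}
inputs = {"b.dat", "c.dat"}

for dep, outputs in outputs_per_taskid.items():
    for file_name in inputs.intersection(outputs):
        print("transfer", file_name, "from", dep)  # would become a Datatransfer
        inputs.discard(file_name)                  # covered by a parent output

print("assumed to be staged beforehand:", inputs)  # inputs with no producer
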
Example No. 7
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[1])

    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 supercomputer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] +
                                                      ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path to spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        print("######\nStart parsing Tasks\n######")
        task_df = spark.read.format('com.databricks.spark.csv').options(
            header='true', inferschema='true').load(
                os.path.join(path_to_dir, '*.csv.processed'))

        # Drop the pref column to save memory, and filter out unsuccessful jobs as their information is not reliable
        task_df = task_df.drop('pref').filter(
            task_df.status == ":instance.status/success").drop(
                'status').cache()

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def sub_two_datetimes(s1, s2):
            arr = []
            for i in s1.keys():
                d1 = datetime.datetime.strptime(s1[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                d2 = datetime.datetime.strptime(s2[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                arr.append(int((d2 - d1).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df \
            .withColumn('wait_time', sub_two_datetimes(F.col('submit-time'), F.col('start-time'))) \
            .withColumn('runtime', sub_two_datetimes(F.col('start-time'), F.col('end-time')))

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def date_time_to_unix(series):
            arr = []
            epoch = datetime.datetime.utcfromtimestamp(0)
            for i in series.keys():
                arr.append(
                    np.int64((datetime.datetime.strptime(
                        series[i], '%a %b %d %H:%M:%S %Z %Y') -
                              epoch).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df.withColumn(
            'submit-time',
            date_time_to_unix(F.col('submit-time'))).withColumnRenamed(
                'submit-time',
                "ts_submit").drop('start-time').drop('end-time').cache()

        min_ts = task_df.agg({"ts_submit": "min"}).collect()[0][0]
        task_df = task_df.withColumn('ts_submit',
                                     F.col('ts_submit') - F.lit(min_ts))

        @F.pandas_udf(T.DoubleType(), F.PandasUDFType.SCALAR)
        def convert_to_kb(v):
            return v * 1024

        task_df = task_df.withColumn('memory', convert_to_kb(
            task_df.memory)).withColumnRenamed("memory", "memory_consumption")

        @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR)
        def string_to_int(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash(v[i], signed=True))

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def string_to_long(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash64(v[i], signed=True)[0])

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def assign_workflow_ids(v):
            arr = []
            for i in v.keys():
                if v[i]:
                    arr.append(mmh3.hash64(v[i], signed=True)[0])
                else:
                    arr.append(
                        mmh3.hash64(uuid4().bytes, signed=True)
                        [0])  # Assign a UUID, collision chance is negligible.

            return pd.Series(arr)

        task_df = task_df.withColumn('user', string_to_int(
            task_df.user)).withColumnRenamed("user", "user_id")
        task_df = task_df.withColumn('job-uuid',
                                     string_to_long(
                                         F.col('job-uuid'))).withColumnRenamed(
                                             'job-uuid', 'task_id')

        type_udf = F.udf(lambda x: "Independent" if x is None else "Composite",
                         T.StringType())
        task_df = task_df.withColumn('type', type_udf(task_df.simset))

        task_df = task_df.withColumn('simset',
                                     assign_workflow_ids(
                                         F.col('simset'))).withColumnRenamed(
                                             'simset', "workflow_id")
        task_df = task_df.withColumnRenamed('cpu', 'resource_amount_requested')

        task_df = task_df.withColumnRenamed('instance', 'resource_used')

        # Set the static items that are not present in the trace
        task_df = task_df.withColumn('submission_site', F.lit(0))
        task_df = task_df.withColumn('parents',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('children',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('group_id', F.lit(0))
        task_df = task_df.withColumn('nfrs', F.lit("{}"))
        task_df = task_df.withColumn('params', F.lit("{}"))
        task_df = task_df.withColumn('memory_requested', F.lit(-1))
        task_df = task_df.withColumn('network_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_space_requested', F.lit(-1))
        task_df = task_df.withColumn('energy_consumption', F.lit(-1))

        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()),
                    exist_ok=True)
        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()),
                              mode="overwrite",
                              compression="snappy")
        print("######\nDone parsing Tasks\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\nStart parsing TaskState\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        task_state_structtype = T.StructType([
            T.StructField("ts_start", T.LongType(), False),
            T.StructField("ts_end", T.LongType(), False),
            T.StructField("workflow_id", T.LongType(), False),
            T.StructField("task_id", T.LongType(), False),
            T.StructField("resource_id", T.LongType(), False),
            T.StructField("cpu_rate", T.DoubleType(), False),
            T.StructField("canonical_memory_usage", T.DoubleType(), False),
            T.StructField("assigned_memory", T.DoubleType(), False),
            T.StructField("minimum_memory_usage", T.DoubleType(), False),
            T.StructField("maximum_memory_usage", T.DoubleType(), False),
            T.StructField("disk_io_time", T.DoubleType(), False),
            T.StructField("maximum_disk_bandwidth", T.DoubleType(), False),
            T.StructField("local_disk_space_usage", T.DoubleType(), False),
            T.StructField("maximum_cpu_rate", T.DoubleType(), False),
            T.StructField("maximum_disk_io_time", T.DoubleType(), False),
            T.StructField("sample_rate", T.DoubleType(), False),
            T.StructField("sample_portion", T.DoubleType(), False),
            T.StructField("sampled_cpu_usage", T.DoubleType(), False),
            T.StructField("network_io_time", T.DoubleType(), False),
            T.StructField("maximum_network_bandwidth", T.DoubleType(), False),
        ])

        @F.pandas_udf(returnType=task_state_structtype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_task_states(df):
            workflow_id = df['workflow_id'].iloc[0]
            task_id = df['task_id'].iloc[0]
            ts_start = df['ts_submit'].min()
            ts_end = ts_start + df['runtime'].max()
            resource_id = df['resource_used'].iloc[0]
            cpu_rate = -1
            canonical_memory_usage = df['memory_consumption'].mean()
            assigned_memory = -1
            minimum_memory_usage = df['memory_consumption'].min()
            maximum_memory_usage = df['memory_consumption'].max()
            disk_io_time = -1
            maximum_disk_bandwidth = -1
            local_disk_space_usage = -1
            maximum_cpu_rate = -1
            maximum_disk_io_time = -1
            sample_rate = -1
            sample_portion = -1
            sampled_cpu_usage = -1
            network_io_time = -1
            maximum_network_bandwidth = -1

            data_dict = {
                "ts_start": ts_start,
                "ts_end": ts_end,
                "workflow_id": workflow_id,
                "task_id": task_id,
                "resource_id": resource_id,
                "cpu_rate": cpu_rate,
                "canonical_memory_usage": canonical_memory_usage,
                "assigned_memory": assigned_memory,
                "minimum_memory_usage": minimum_memory_usage,
                "maximum_memory_usage": maximum_memory_usage,
                "disk_io_time": disk_io_time,
                "maximum_disk_bandwidth": maximum_disk_bandwidth,
                "local_disk_space_usage": local_disk_space_usage,
                "maximum_cpu_rate": maximum_cpu_rate,
                "maximum_disk_io_time": maximum_disk_io_time,
                "sample_rate": sample_rate,
                "sample_portion": sample_portion,
                "sampled_cpu_usage": sampled_cpu_usage,
                "network_io_time": network_io_time,
                "maximum_network_bandwidth": maximum_network_bandwidth,
            }

            return pd.DataFrame(data_dict, index=[0])

        task_state_df = task_df.groupBy(['workflow_id',
                                         'task_id']).apply(compute_task_states)
        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()),
                    exist_ok=True)
        task_state_df.write.parquet(os.path.join(TARGET_DIR,
                                                 TaskState.output_path()),
                                    mode="overwrite",
                                    compression="snappy")
        print("######\nDone parsing TaskState\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\nStart parsing Resources\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        resource_id_column = [
            i.resource_used
            for i in task_df.select('resource_used').distinct().collect()
        ]

        resources = []
        for resource_id in resource_id_column:
            resources.append(
                Resource(resource_id, 'Cluster Node', 24, '', 256, -1, -1,
                         '').get_parquet_dict())

        resource_df = pd.DataFrame(resources)
        os.makedirs(os.path.join(TARGET_DIR, Resource.output_path()),
                    exist_ok=True)
        resource_df.to_parquet(os.path.join(TARGET_DIR, Resource.output_path(),
                                            'part.0.parquet'),
                               engine="pyarrow")
        print("######\nDone parsing Resources\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\nStart parsing Workflows\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(),
                          False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = "Cook"
            total_resources = df['resource_amount_requested'].sum()
            total_memory_usage = df['memory_consumption'].sum()
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        workflow_df = task_df.groupBy('workflow_id').apply(
            compute_workflow_stats)
        workflow_df.explain(True)
        workflow_df.write.parquet(os.path.join(TARGET_DIR,
                                               Workflow.output_path()),
                                  mode="overwrite",
                                  compression="snappy")
        print("######\nDone parsing Workflows\n######")

    print("######\nStart parsing Ẁorkload\n######")
    pandas_task_df = pd.read_parquet(os.path.join(TARGET_DIR,
                                                  Task.output_path()),
                                     engine="pyarrow")
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        pandas_task_df,
        domain="Industrial",
        start_date=None,
        end_date=None,
        authors=["Two Sigma"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # json cannot serialize numpy integer types, so convert them explicitly.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
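
The Spark job above relies on scalar pandas UDFs that parse timestamps of the form 'Mon Jan 01 12:00:00 UTC 2018' and return millisecond differences; a pandas-only sketch of that conversion, using invented sample timestamps:

import datetime
import pandas as pd

FMT = '%a %b %d %H:%M:%S %Z %Y'   # timestamp format used by the UDFs above

def diff_ms(s1, s2):
    # Same arithmetic as sub_two_datetimes above, without the Spark wrapper.
    return pd.Series([
        int((datetime.datetime.strptime(b, FMT) -
             datetime.datetime.strptime(a, FMT)).total_seconds() * 1000)
        for a, b in zip(s1, s2)
    ])

submit = pd.Series(["Mon Jan 01 12:00:00 UTC 2018"])   # made-up submit time
start = pd.Series(["Mon Jan 01 12:00:05 UTC 2018"])    # made-up start time
print(diff_ms(submit, start))                          # -> 5000 ms wait time
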
def parse_workflow(wf, filename):
    workflow_id = string2numeric_hash(wf['name'] + '-(' + wf['id'] + ')')
    workflow_domain = ""  # The domain the workflow belongs to, e.g. industry, science, etc.
    workflow_application_name = ""  # The name of the application, e.g. Montage, SIPHT
    workflow_appliation_field = ""  # The field of the application, e.g. bioinformatics, astronomy
    if "bwa" in filename.lower():
        workflow_id = string2numeric_hash("bwa" + '-(' + wf['id'] + ')')
        workflow_domain = "science"
        workflow_application_name = "Burroughs-Wheeler Alignment tool"
        workflow_appliation_field = "bioinformatics"
    elif "wien2k" in filename.lower():
        workflow_id = string2numeric_hash("wien2k" + '-(' + wf['id'] + ')')
        workflow_domain = "science"
        workflow_application_name = "Wien2k"
        workflow_appliation_field = "materials chemistry"
    resources = {}
    for r in wf['resources']:  # parse resources for tasks later
        r_details = r['details']
        operating_system = "Linux"
        details = {}
        details['provider'] = r_details['provider']
        details['instanceType'] = r_details['instanceType']
        events = parse_events(r['events'])
        # id, type, num_resources, proc_model_name, memory, disk_space, network_bandwidth, operating_system, details=None, events=None
        resources[string2numeric_hash(r['id'])] = Resource(
            string2numeric_hash(r['name']), r['type'], r_details['vCPUs'],
            r_details['physicalProcessor'], r_details['memory'],
            r_details['storage'], r_details['networkPerformance'],
            operating_system, details, events)
    # create list of tasks for parents
    if "wien2k" in filename.lower():
        first = []
        second = []
        third = []
        fourth = []
        last = []
        for t in wf['tasks']:
            if "first" in t['name'].lower():
                first.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "second" in t['name'].lower():
                second.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "third" in t['name'].lower():
                third.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "fourth" in t['name'].lower():
                fourth.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "last" in t['name'].lower():
                last.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
    elif "bwa" in filename.lower():
        bwaindex_split1_2 = []
        bwa1aln = []
        bwaconcat = []
        for t in wf['tasks']:
            if "bwa:bwaindex" in t['name'].lower():
                bwaindex_split1_2.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:split1" in t['name'].lower():
                bwaindex_split1_2.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:split2" in t['name'].lower():
                bwaindex_split1_2.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:bwa1aln" in t['name'].lower():
                bwa1aln.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
            if "bwa:concat" in t['name'].lower():
                bwaconcat.append(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')))
    tasks = []
    for t in wf['tasks']:  # parse tasks
        if "cloud init" not in t['name'].lower(
        ) and "cloud instances" not in t['name'].lower(
        ) and "parforiteration" not in t['type'].lower(
        ) and "parallel" not in t['type'].lower(
        ) and "section" not in t['type'].lower():
            parents = []
            if "wien2k" in filename.lower():
                if "second" in t['name'].lower():
                    parents = first
                if "third" in t['name'].lower():
                    parents = second
                if "fourth" in t['name'].lower():
                    parents = third
                if "last" in t['name'].lower():
                    parents = fourth
            elif "bwa" in filename.lower():
                if "bwa:bwa1aln" in t['name'].lower():
                    parents = bwaindex_split1_2
                if "bwa:concat" in t['name'].lower():
                    parents = bwa1aln
            # print(parents)
            submission_site = string2numeric_hash(t['submissionSite'])
            res = None
            if submission_site != '':
                res = resources[submission_site]
            params = parse_params(t['params'])
            events = parse_events(t['events'])
            wait_time = 0
            if "ACTIVE" in events:
                wait_time = int((parse(events["ACTIVE"]) -
                                 parse(t['startTime'])).total_seconds() * 1000)
            # id, type, ts_submit,
            # submission_site, runtime, resource_amount_requested, parents,
            # workflow_id, wait_time, resource_type="cpu", resource=None, datatransfer=None, params=None, events=None, requirements=None, user_id=-1, group_id=-1, memory_requested=-1, disk_space_requested=-1, disk_io_time=-1, network_io_time=-1, energy_consumption=-1
            tasks.append(
                Task(
                    string2numeric_hash(str(t['name'] + '-(' + t['id'] + ')')),
                    t['type'], int(parse(t['startTime']).timestamp() * 1000),
                    submission_site,
                    int((parse(t['endTime']) -
                         parse(t['startTime'])).total_seconds() * 1000), 1,
                    parents, workflow_id, wait_time, "CPU", res,
                    parse_datatransfers(t['fileTransfers']), params, events))
    ts_start = min(t['startTime'] for t in wf['tasks'])
    # id, ts_submit, tasks, scheduler_description
    return Workflow(workflow_id,
                    int(parse(wf['beginTime']).timestamp() * 1000), tasks,
                    wf['scheduler'], workflow_domain,
                    workflow_application_name, workflow_appliation_field)
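
parse_workflow derives submit times and runtimes by parsing timestamp strings and scaling the difference to milliseconds; a minimal sketch of that arithmetic, assuming the parse helper used above is dateutil's parser and using invented ISO timestamps:

from dateutil.parser import parse

start_time = "2016-08-01T10:00:00.000Z"   # hypothetical task startTime
end_time = "2016-08-01T10:00:12.500Z"     # hypothetical task endTime

ts_submit_ms = int(parse(start_time).timestamp() * 1000)
runtime_ms = int((parse(end_time) - parse(start_time)).total_seconds() * 1000)
print(ts_submit_ms, runtime_ms)           # epoch-ms submit time and 12500 ms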