Example #1
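Presumably this example relies on the following imports, plus module-level names defined elsewhere in the parsing script (TARGET_DIR, NAME, the Workflow, Task, and Workload classes, and the parse_workflows/parse_workflow helpers):

import json
import os
import sqlite3

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
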
def parse_workload(pegasus_db_path, filename=NAME, workload_domain="", workload_description=""):
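    """Parse workflows from a Pegasus SQLite database and write them as parquet to TARGET_DIR."""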
    os.makedirs(TARGET_DIR, exist_ok=True)

    conn = sqlite3.connect(pegasus_db_path)
    c = conn.cursor()

    workflows = parse_workflows(c)

    for w in workflows:
        w.compute_critical_path()

    # Write the workflow objects to parquet
    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()), exist_ok=True)
    workflow_df = pd.DataFrame([workflow.get_parquet_dict() for workflow in workflows])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(), "part.0.parquet"), engine="pyarrow")

    # Write all tasks to parquet
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame([task.get_parquet_dict() for wf in workflows for task in wf.tasks])
    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    pyarrow_task_schema = Task.get_pyarrow_schema()
    table = pa.Table.from_pandas(task_df, schema=pyarrow_task_schema, preserve_index=False)

    # Pandas cannot tell the difference between an empty list and a list of
    # integers, so type mismatches can occur on write. To avoid this, we write
    # the task table with pyarrow directly, using an explicit schema.
    pq.write_table(table, os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"))

    # Generate the workload description
    authors_list = ["Pegasus Team"]

    w = Workload(workflows, workload_domain, authors_list, workload_description)

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain=workload_domain or "Scientific",
        authors=authors_list,
        workload_description=workload_description)

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)

    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))

    conn.close()
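
The schema comment above points at a real pandas/pyarrow pitfall: pandas stores list columns as generic object dtype, so a column that mixes empty and non-empty lists has no inferable element type. A minimal sketch of the workaround, with hypothetical column names:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical task frame: "parents" is empty in the first row, so pandas
# alone cannot tell whether the column holds lists of ints or of strings.
df = pd.DataFrame({"id": [1, 2], "parents": [[], [1]]})

# An explicit schema pins the element type, which is what
# Task.get_pyarrow_schema() provides for the real task table.
schema = pa.schema([("id", pa.int64()), ("parents", pa.list_(pa.int64()))])
table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
pq.write_table(table, "tasks.parquet")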


def parse_askalon_file(askalon_file):
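    """Parse workflows from an Askalon JSON trace and write them as parquet to TARGET_DIR."""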
    os.makedirs(TARGET_DIR, exist_ok=True)

    workflows = []
    with open(askalon_file, 'r') as askalon_trace:
        data = json.load(askalon_trace)
        for wf in data:
            workflows.append(parse_workflow(wf, askalon_file))

    for w in workflows:
        w.compute_critical_path()

    # Write the workflow objects to parquet
    os.makedirs(os.path.join(TARGET_DIR, Workflow.output_path()),
                exist_ok=True)
    workflow_df = pd.DataFrame(
        [workflow.get_parquet_dict() for workflow in workflows])
    workflow_df.to_parquet(os.path.join(TARGET_DIR, Workflow.output_path(),
                                        "part.0.parquet"),
                           engine="pyarrow")

    # Write all tasks to parquet
    os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
    task_df = pd.DataFrame(
        [task.get_parquet_dict() for wf in workflows for task in wf.tasks])
    # Make sure the first workflow is submitted at time 0
    min_submit_time = task_df["ts_submit"].min()
    task_df = task_df.assign(
        ts_submit=lambda x: x['ts_submit'] - min_submit_time)

    pyarrow_task_schema = Task.get_pyarrow_schema()
    table = pa.Table.from_pandas(task_df,
                                 schema=pyarrow_task_schema,
                                 preserve_index=False)

    # Pandas cannot tell the difference between an empty list and a list of
    # integers, so type mismatches can occur on write. To avoid this, we write
    # the task table with pyarrow directly, using an explicit schema.
    pq.write_table(
        table, os.path.join(TARGET_DIR, Task.output_path(), "part.0.parquet"))

    # Authors of the original trace
    authors_list = ["Roland Matha", "Radu Prodan"]

    # Generate the workload description based on the trace file name
    workload_description = ""
    if "bwa" in askalon_file.lower():
        workload_description = "BWA (short for Burroughs-Wheeler Alignment tool) is a genomics analysis workflow, courtesy of Scott Emrich and Notre Dame Bioinformatics Laboratory. It maps low-divergent sequences against a large reference genome, such as the human genome."
    elif "wien2k" in askalon_file.lower():
        workload_description = "Wien2k uses a full-potential Linearized Augmented Plane Wave (LAPW) approach for the computation of crystalline solids."

    workload_domain = "Scientific"

    w = Workload(workflows, workload_domain, authors_list,
                 workload_description)

    # Write a json dict with the workload properties
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        task_df,
        domain=workload_domain,
        authors=authors_list,
        workload_description=workload_description)

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)
            raise TypeError

        file.write(json.dumps(json_dict, default=default))
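
A hypothetical driver for both parsers; the trace paths below are placeholders, and all output lands under TARGET_DIR:

if __name__ == "__main__":
    # Placeholder inputs: a Pegasus SQLite database and an Askalon JSON trace.
    # Each parser writes parquet tables plus a generic_information.json file.
    parse_workload("traces/pegasus/workflow.db")
    parse_askalon_file("traces/askalon/bwa.json")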