Example #1
import os
import time
import traceback


def execute_script():
    try:
        from pm4py.objects.log.importer.parquet import factory as parquet_importer
        from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics

        log_path = os.path.join("..", "tests", "input_data", log_name)
        time1 = time.time()
        dataframe = parquet_importer.apply(
            log_path, parameters={"columns": allowed_columns})
        time2 = time.time()
        print(dataframe.columns)
        print(
            "time elapsed importing " + log_name + " on columns " +
            str(allowed_columns) + ": ", (time2 - time1))
        dfg1 = df_statistics.get_dfg_graph(dataframe,
                                           sort_timestamp_along_case_id=False)
        time3 = time.time()
        print(
            "time elapsed calculating the DFG on columns " +
            str(allowed_columns) + ": ", (time3 - time2))
        del dataframe
        time4 = time.time()
        dataframe = parquet_importer.apply(log_path)
        print(dataframe.columns)
        time5 = time.time()
        print("time interlapsed importing " + log_name + " (all columns): ",
              (time5 - time4))
        dfg2 = df_statistics.get_dfg_graph(dataframe,
                                           sort_timestamp_along_case_id=False)
        time6 = time.time()
        print("time interlapsed calculating the DFG on all columns : ",
              (time6 - time5))
    except Exception:
        traceback.print_exc()
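
A minimal driver sketch for the benchmark above, assuming hypothetical values for the module-level names log_name and allowed_columns that execute_script() expects:

# Hypothetical setup: adjust the file name and the column list to the log being benchmarked.
log_name = "receipt.parquet"
allowed_columns = ["case:concept:name", "concept:name", "time:timestamp"]

if __name__ == "__main__":
    execute_script()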
Example #2
def load_parquet_from_path(path,
                           columns,
                           filters,
                           use_transition=False,
                           force_classifier_insertion=False,
                           force_timestamp_conversion=False,
                           parameters=None):
    if parameters is None:
        parameters = {}
    if filters is None:
        filters = []
    if columns is None:
        columns = []
        df = parquet_importer.apply(path)
    else:
        df = parquet_importer.apply(path, parameters={"columns": columns})

    if DEFAULT_TIMESTAMP_KEY in columns and (filters
                                             or force_timestamp_conversion):
        df[DEFAULT_TIMESTAMP_KEY] = pd.to_datetime(df[DEFAULT_TIMESTAMP_KEY],
                                                   utc=True)

    if use_transition:
        df = insert_classifier(df)
    elif force_classifier_insertion:
        df["@@classifier"] = df[DEFAULT_NAME_KEY]

    return df
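
A hedged usage sketch for load_parquet_from_path, with a hypothetical file path and assuming the usual "concept:name" / "time:timestamp" values behind DEFAULT_NAME_KEY and DEFAULT_TIMESTAMP_KEY:

# Hypothetical call: import three columns only and force the timestamp conversion.
df = load_parquet_from_path(
    "logs/receipt.parquet",
    columns=["case:concept:name", "concept:name", "time:timestamp"],
    filters=[],
    force_timestamp_conversion=True)
print(df.dtypes)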
Example #3
    def test_importExportParquet(self):
        # to avoid static-method warnings in tests, which by construction
        # of the unittest package have to be written this way
        from pm4py.objects.log.importer.parquet import factory as parquet_importer
        from pm4py.objects.log.exporter.parquet import factory as parquet_exporter
        self.dummy_variable = "dummy_value"
        df1 = parquet_importer.apply(
            os.path.join(INPUT_DATA_DIR, "running-example.parquet"))
        parquet_exporter.export_log(
            df1, os.path.join(OUTPUT_DATA_DIR, "running-example.parquet"))
        df2 = parquet_importer.apply(
            os.path.join(OUTPUT_DATA_DIR, "running-example.parquet"))
        self.assertEqual(len(df1), len(df2))
        os.remove(os.path.join(OUTPUT_DATA_DIR, "running-example.parquet"))
Example #4
    def test_importing_parquet(self):
        from pm4py.objects.log.importer.parquet import factory as parquet_importer
        # import the same dataframe with both the pyarrow and the fastparquet variants
        df = parquet_importer.apply(
            os.path.join("input_data", "receipt.parquet"), variant="pyarrow")
        df = parquet_importer.apply(
            os.path.join("input_data", "receipt.parquet"), variant="fastparquet")
        # import the log as a full event log and as a minimal event log
        log = parquet_importer.import_log(
            os.path.join("input_data", "running-example.parquet"), variant="pyarrow")
        log = parquet_importer.import_minimal_log(
            os.path.join("input_data", "running-example.parquet"), variant="pyarrow")
Example #5
    def build_from_path(self, path, parameters=None):
        """
        Builds the handler from the specified path to a Parquet file

        Parameters
        -------------
        path
            Path to the log file
        parameters
            Parameters of the algorithm
        """
        if parameters is None:
            parameters = {}
        self.dataframe = parquet_importer.apply(path)
        # TODO: verify if this is the best way to act
        self.dataframe[DEFAULT_TIMESTAMP_KEY] = pd.to_datetime(self.dataframe[DEFAULT_TIMESTAMP_KEY], utc=True)
        self.postloading_processing_dataframe()
        self.dataframe = self.dataframe.sort_values([DEFAULT_TIMESTAMP_KEY, ws_constants.DEFAULT_EVENT_INDEX_KEY])
        if str(self.dataframe[CASE_CONCEPT_NAME].dtype) != "object":
            self.dataframe[CASE_CONCEPT_NAME] = self.dataframe[CASE_CONCEPT_NAME].astype(str)
        if ws_constants.DEFAULT_CASE_INDEX_KEY not in self.dataframe:
            self.dataframe[ws_constants.DEFAULT_CASE_INDEX_KEY] = self.dataframe.groupby(CASE_CONCEPT_NAME).ngroup()
        if not self.is_lazy:
            self.sort_dataframe_by_case_id()
            self.build_reduced_dataframe()
            self.build_variants_df()
            self.build_grouped_dataframe()
            self.build_reduced_grouped_dataframe()
            self.calculate_events_number()
            self.calculate_variants_number()
            self.calculate_cases_number()
Example #6
    def build_from_path(self, path, parameters=None):
        """
        Builds the handler from the specified path to a Parquet file

        Parameters
        -------------
        path
            Path to the log file
        parameters
            Parameters of the algorithm
        """
        if parameters is None:
            parameters = {}
        self.dataframe = parquet_importer.apply(path)
        # TODO: verify if this is the best way to act
        self.dataframe[DEFAULT_TIMESTAMP_KEY] = pd.to_datetime(
            self.dataframe[DEFAULT_TIMESTAMP_KEY], utc=True)
        self.postloading_processing_dataframe()
        self.reduced_dataframe = self.dataframe[[
            CASE_CONCEPT_NAME, self.activity_key, DEFAULT_TIMESTAMP_KEY
        ]]
        self.build_variants_df()
        self.grouped_dataframe = self.dataframe.groupby(CASE_CONCEPT_NAME)
        self.reduced_grouped_dataframe = self.reduced_dataframe.groupby(
            CASE_CONCEPT_NAME)
        self.calculate_events_number()
        self.calculate_variants_number()
        self.calculate_cases_number()
Example #7
def execute_script():
    log_path = os.path.join("..", "tests", "input_data", log_name)
    time1 = time.time()
    dataframe = parquet_importer.apply(log_path, parameters={"columns": allowed_columns})
    time2 = time.time()
    print(dataframe.columns)
    print("time elapsed importing " + log_name + " on columns " + str(allowed_columns) + ": ", (time2 - time1))
    dfg1 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
    time3 = time.time()
    print("time elapsed calculating the DFG on columns " + str(allowed_columns) + ": ", (time3 - time2))
    del dataframe
    time4 = time.time()
    dataframe = parquet_importer.apply(log_path)
    print(dataframe.columns)
    time5 = time.time()
    print("time elapsed importing " + log_name + " (all columns): ", (time5 - time4))
    dfg2 = df_statistics.get_dfg_graph(dataframe, sort_timestamp_along_case_id=False)
    time6 = time.time()
    print("time elapsed calculating the DFG on all columns: ", (time6 - time5))
Example #8
    def __init__(self, name, file: str = None, log: pd.DataFrame = None):
        if (file is None and log is None) or (file is not None and log is not None):
            raise Exception(
                "You must either provide a file to load or pass a log object.")
        if file:
            self.log = parquet_importer.apply(file)
        else:
            self.log = log

        self.name: str = name
        self.id: str = str(uuid.uuid4())
Example #9
def get_events(path, log_name, managed_logs, parameters=None):
    if parameters is None:
        parameters = {}

    no_samples = parameters.get(PARAMETER_NO_SAMPLES, DEFAULT_MAX_NO_SAMPLES)
    use_transition = parameters.get(PARAMETER_USE_TRANSITION, DEFAULT_USE_TRANSITION)
    activity_key = PARAMETER_PM4PYWS_CLASSIFIER if use_transition else DEFAULT_NAME_KEY
    filters = parameters.get(FILTERS, [])
    parameters[pm4py_constants.PARAMETER_CONSTANT_ACTIVITY_KEY] = activity_key
    parameters[pm4py_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY] = activity_key

    case_id = parameters["case_id"]

    folder = os.path.join(path, log_name)

    parquet_list = parquet_importer.get_list_parquet(folder)

    ret = []

    count = 0
    for index, pq in enumerate(parquet_list):
        pq_basename = Path(pq).name
        if pq_basename in managed_logs:
            count = count + 1

            df = get_filtered_parquet(pq,
                                      None,
                                      filters,
                                      use_transition=use_transition,
                                      parameters=parameters)

            try:
                events = case_statistics.get_events(df, case_id)
                if len(events) > 0:
                    df = parquet_importer.apply(pq)
                    ret = df[df[CASE_CONCEPT_NAME] == case_id].dropna(
                        how="all", axis=1)
                    if activity_key != PARAMETER_PM4PYWS_CLASSIFIER:
                        ret[PARAMETER_PM4PYWS_CLASSIFIER] = ret[activity_key]
                    ret = ret.to_dict('records')
                    break
            except Exception:
                pass

            if count >= no_samples:
                break

    return ret
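
A hedged usage sketch for get_events, assuming a distributed Parquet log laid out as <path>/<log_name>/<partition>.parquet; the base path, log name, managed partition list and case identifier below are all hypothetical:

# Hypothetical call: retrieve the events of case "case_42" from a partitioned log.
events = get_events(
    "/data/parquet_logs",
    "receipt",
    {"0.parquet", "1.parquet"},
    parameters={"case_id": "case_42"})
for event in events:
    print(event)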
Example #10
def import_from_file(file_path, parameters=None):
    """
    Apply the deserialization to a file produced by Pyarrow serialization

    Parameters
    --------------
    file_path
        File path
    parameters
        Parameters of the algorithm

    Returns
    --------------
    deser
        Deserialized object
    """
    if parameters is None:
        parameters = {}

    from pm4py.objects.log.importer.parquet import factory as parquet_importer
    dataframe = parquet_importer.apply(file_path, parameters=parameters)

    return dataframe
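
A minimal usage sketch, with a hypothetical path to a previously exported Parquet file:

# Hypothetical path; the resulting object is a pandas dataframe.
dataframe = import_from_file("output_data/running-example.parquet")
print(len(dataframe))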
Example #11
import os

from pm4py.objects.log.importer.parquet import factory as parquet_importer

from analysis.Correlation import Correlation
from analysis.Regression import Regression
from utils.Configuration import Configuration

# from evaluation.Prediction import Prediction

dataset_path = os.path.join('/workspaces/data/BPIC-17',
                            'BPI_Challenge_2017.parquet')
# dataset_path = os.path.join('/workspaces/data/BPIC-12',
# 'BPI_Challenge_2012.parquet')
# dataset_path = os.path.join('all_wl30min_psInSecMax7200_dt.parquet')
# dataset_path = os.path.join('top20_wl30min_psInSecMax7200_dt_bpi12.parquet')

ALREADY_ANALYSED = False

log = parquet_importer.apply(dataset_path)

# print(log["concept:name"].value_counts())
# print(get_resources(log, as_dict=True))
# exit()

OUTPUT_PATH = "results/"

# Currently we have to use a multiindex due to duplicates in the timestamps (at least pandas says so)
# log.set_index('time:timestamp', inplace=True, append=True, drop=False)
# log.set_index('time:timestamp', inplace=True, verify_integrity=True, append=True, drop=False)
if not ALREADY_ANALYSED:
    log.set_index('time:timestamp', inplace=True, drop=False)
    log.sort_index(inplace=True)

    # Filter for Workflow events only (Offer and Application events do not have a duration)
Example #12
def transform_csv_dataset_to_parquet_distr_dataset(source_path, target_path, target_num_partitions,
                                                   activity_key=xes.DEFAULT_NAME_KEY,
                                                   timestamp_key=xes.DEFAULT_TIMESTAMP_KEY,
                                                   caseid_key=CASE_CONCEPT_NAME, parameters=None):
    """
    Transforms the CSV dataset to a Parquet distributed dataset

    Parameters
    -------------
    source_path
        Source path (folder containing several CSV files)
    target_path
        Target path (distributed Parquet dataset)
    target_num_partitions
        Target number of partitions (number of divisions of the output)
    activity_key
        Column that is the activity
    timestamp_key
        Column that is the timestamp
    caseid_key
        Column that is the case ID
    parameters
        Possible parameters of the algorithm, including:
            - sep: the separator
            - quotechar: the quotechar
            - encoding: the encoding
            - timest_columns: the list of columns that contain timestamps
            - timest_format: the format of ALL the timest_columns

    Returns
    -------------
    void
    """
    if parameters is None:
        parameters = {}

    # create the target folder (it may already exist)
    try:
        os.mkdir(target_path)
    except FileExistsError:
        pass

    # create the empty partition files
    dataframe = pd.DataFrame({})
    for i in range(target_num_partitions):
        tp = os.path.join(target_path, str(i) + ".parquet")
        parquet_exporter.apply(dataframe, tp)
    # distribute the events of each CSV file over the partitions
    files = os.listdir(source_path)
    for index, file in enumerate(files):
        if file.lower().endswith("csv"):
            sp = os.path.join(source_path, file)
            source_df = csv_importer.import_dataframe_from_path(sp, parameters=parameters)
            if activity_key != xes.DEFAULT_NAME_KEY and xes.DEFAULT_NAME_KEY not in source_df.columns:
                source_df[xes.DEFAULT_NAME_KEY] = source_df[activity_key]
            if timestamp_key != xes.DEFAULT_TIMESTAMP_KEY and xes.DEFAULT_TIMESTAMP_KEY not in source_df.columns:
                source_df[xes.DEFAULT_TIMESTAMP_KEY] = source_df[timestamp_key]
            if caseid_key != CASE_CONCEPT_NAME and CASE_CONCEPT_NAME not in source_df.columns:
                source_df[CASE_CONCEPT_NAME] = source_df[caseid_key]
            # assign each case to a partition through a hash of its case identifier
            source_df["@@partition"] = source_df[caseid_key].apply(hash)
            source_df["@@partition"] = source_df["@@partition"] % target_num_partitions
            for i in range(target_num_partitions):
                tp = os.path.join(target_path, str(i) + ".parquet")
                df2 = source_df[source_df["@@partition"] == i]
                del df2["@@partition"]
                # df2 = df2.reset_index()
                df1 = parquet_importer.apply(tp)
                df = pd.concat([df1, df2])
                if index == len(files) - 1:
                    df = df.sort_values([caseid_key, timestamp_key])
                print("input %d/%d output %d/%d len(df)=" % (index + 1, len(files), i + 1, target_num_partitions), len(df))
                parquet_exporter.apply(df, tp)
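
A hedged usage sketch with hypothetical source and target folders, splitting the dataset into four partitions:

# Hypothetical folders: a directory of CSV chunks in, a partitioned Parquet dataset out.
transform_csv_dataset_to_parquet_distr_dataset(
    "csv_chunks",
    "parquet_dataset",
    4,
    parameters={"sep": ",", "encoding": "utf-8"})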