def get_kde_date_attribute_json(values, parameters=None):
    """
    Gets the KDE estimation for the distribution of a date attribute values
    (expressed as JSON)

    Parameters
    --------------
    values
        Values of the date attribute value
    parameters
        Possible parameters of the algorithm, including:
            graph_points: number of points to include in the graph

    Returns
    --------------
    json
        JSON representing the graph points (list of [seconds-since-epoch, density] pairs)
    """
    x, y = get_kde_date_attribute(values, parameters=parameters)
    # Epoch reference used to turn datetimes into seconds
    epoch = datetime(1970, 1, 1)
    if check_pandas_ge_024():
        # Newer pandas needs a timezone-aware Timestamp for comparison/arithmetic
        epoch = epoch.replace(tzinfo=pytz.utc)
        epoch = pd.to_datetime(epoch, utc=True)
    graph_points = [
        ((xv.replace(tzinfo=None) - epoch.replace(tzinfo=None)).total_seconds(), yv)
        for xv, yv in zip(x, y)
    ]
    return json.dumps(graph_points)
def filter_traces_intersecting(df, dt1, dt2, parameters=None):
    """
    Filter traces intersecting the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp
            Parameters.CASE_ID_KEY -> Column that contains the timestamp

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    if check_pandas_ge_024():
        # Newer pandas requires timezone-aware Timestamps for the comparisons below
        dt1 = pd.to_datetime(dt1.replace(tzinfo=pytz.utc), utc=True)
        dt2 = pd.to_datetime(dt2.replace(tzinfo=pytz.utc), utc=True)
    grouped = df[[case_id_glue, timestamp_key]].groupby(df[case_id_glue])
    first_ev = grouped.first()
    last_ev = grouped.last()
    last_ev.columns = [str(c) + '_2' for c in last_ev.columns]
    bounds = pd.concat([first_ev, last_ev], axis=1)
    start = bounds[timestamp_key]
    end = bounds[timestamp_key + "_2"]
    # A trace intersects the interval when it starts inside it, ends inside
    # it, or fully covers it
    starts_inside = (start > dt1) & (start < dt2)
    ends_inside = (end > dt1) & (end < dt2)
    covers_interval = (start < dt1) & (end > dt2)
    kept = bounds[starts_inside | ends_inside | covers_interval]
    case_index = df.set_index(case_id_glue).index
    kept_index = kept.set_index(case_id_glue).index
    return df[case_index.isin(kept_index)]
def filter_traces_contained(df, dt1, dt2, parameters=None):
    """
    Get traces that are contained in the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp
            Parameters.CASE_ID_KEY -> Column that contains the case identifier

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    # Consistency fix: resolve parameters through exec_utils.get_param_value
    # like the sibling filters (filter_traces_intersecting, apply_events),
    # instead of raw dict lookups on the legacy PARAMETER_CONSTANT_* keys.
    # exec_utils accepts both the enum and its string value, so existing
    # callers keep working.
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    needs_conversion = check_pandas_ge_024()
    if needs_conversion:
        # Newer pandas requires timezone-aware Timestamps for the comparisons below
        dt1 = dt1.replace(tzinfo=pytz.utc)
        dt2 = dt2.replace(tzinfo=pytz.utc)
        dt1 = pd.to_datetime(dt1, utc=True)
        dt2 = pd.to_datetime(dt2, utc=True)
    grouped_df = df[[case_id_glue, timestamp_key]].groupby(df[case_id_glue])
    first = grouped_df.first()
    last = grouped_df.last()
    last.columns = [str(col) + '_2' for col in last.columns]
    stacked = pd.concat([first, last], axis=1)
    # A trace is contained when its first event is after dt1 and its last
    # event is before dt2
    stacked = stacked[stacked[timestamp_key] > dt1]
    stacked = stacked[stacked[timestamp_key + "_2"] < dt2]
    i1 = df.set_index(case_id_glue).index
    i2 = stacked.set_index(case_id_glue).index
    return df[i1.isin(i2)]
def convert_timestamp_columns_in_df(df, timest_format=None, timest_columns=None):
    """
    Convert all dataframe columns in a dataframe

    Parameters
    -----------
    df
        Dataframe
    timest_format
        (If provided) Format of the timestamp columns in the CSV file
    timest_columns
        Columns of the CSV that shall be converted into timestamp

    Returns
    ------------
    df
        Dataframe with timestamp columns converted
    """
    needs_conversion = check_pandas_ge_024()
    for col in df.columns:
        if timest_columns is None or col in timest_columns:
            # Only object (string) columns are candidates for conversion
            if df[col].dtype == 'object':
                try:
                    if timest_format is None:
                        if needs_conversion:
                            df[col] = pd.to_datetime(df[col], utc=True)
                        else:
                            df[col] = pd.to_datetime(df[col])
                    else:
                        if needs_conversion:
                            df[col] = pd.to_datetime(df[col], utc=True, format=timest_format)
                        else:
                            # BUG FIX: timest_format was previously dropped in
                            # this branch, silently falling back to inference
                            df[col] = pd.to_datetime(df[col], format=timest_format)
                except ValueError:
                    # Best-effort: leave non-timestamp columns untouched
                    pass
    return df
def apply_events(df, dt1, dt2, parameters=None):
    """
    Get a new log containing all the events contained in the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    lower_bound = get_dt_from_string(dt1)
    upper_bound = get_dt_from_string(dt2)
    if check_pandas_ge_024():
        # Newer pandas requires timezone-aware Timestamps for the comparisons below
        lower_bound = pd.to_datetime(lower_bound.replace(tzinfo=pytz.utc), utc=True)
        upper_bound = pd.to_datetime(upper_bound.replace(tzinfo=pytz.utc), utc=True)
    # Keep only events strictly inside the (lower_bound, upper_bound) interval
    inside = (df[timestamp_key] > lower_bound) & (df[timestamp_key] < upper_bound)
    return df[inside]