def get_kde_date_attribute_json(values, parameters=None):
    """
    Gets the KDE estimation for the distribution of a date attribute values
    (expressed as JSON)

    Parameters
    --------------
    values
        Values of the date attribute value
    parameters
        Possible parameters of the algorithm, including:
            graph_points: number of points to include in the graph

    Returns
    --------------
    json
        JSON representing the graph points
    """
    x, y = get_kde_date_attribute(values, parameters=parameters)

    dt0 = datetime(1970, 1, 1)
    needs_conversion = check_pandas_ge_024()

    if needs_conversion:
        dt0 = dt0.replace(tzinfo=pytz.utc)
        dt0 = pd.to_datetime(dt0, utc=True)

    ret = []
    for i in range(len(x)):
        ret.append(((x[i].replace(tzinfo=None) -
                     dt0.replace(tzinfo=None)).total_seconds(), y[i]))

    return json.dumps(ret)
示例#2
0
def filter_traces_intersecting(df, dt1, dt2, parameters=None):
    """
    Filter traces intersecting the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp
            Parameters.CASE_ID_KEY -> Column that contains the timestamp

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    needs_conversion = check_pandas_ge_024()
    if needs_conversion:
        dt1 = dt1.replace(tzinfo=pytz.utc)
        dt2 = dt2.replace(tzinfo=pytz.utc)
        dt1 = pd.to_datetime(dt1, utc=True)
        dt2 = pd.to_datetime(dt2, utc=True)
    grouped_df = df[[case_id_glue, timestamp_key]].groupby(df[case_id_glue])
    first = grouped_df.first()
    last = grouped_df.last()
    last.columns = [str(col) + '_2' for col in last.columns]
    stacked = pd.concat([first, last], axis=1)
    stacked1 = stacked[stacked[timestamp_key] > dt1]
    stacked1 = stacked1[stacked1[timestamp_key] < dt2]
    stacked2 = stacked[stacked[timestamp_key + "_2"] > dt1]
    stacked2 = stacked2[stacked2[timestamp_key + "_2"] < dt2]
    stacked3 = stacked[stacked[timestamp_key] < dt1]
    stacked3 = stacked3[stacked3[timestamp_key + "_2"] > dt2]
    stacked = pd.concat([stacked1, stacked2, stacked3], axis=0)
    i1 = df.set_index(case_id_glue).index
    i2 = stacked.set_index(case_id_glue).index
    return df[i1.isin(i2)]
示例#3
0
def filter_traces_contained(df, dt1, dt2, parameters=None):
    """
    Get traces that are contained in the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            timestamp_key -> Attribute to use as timestamp
            case_id_glue -> Column that contains the timestamp

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY
    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    needs_conversion = check_pandas_ge_024()
    if needs_conversion:
        dt1 = dt1.replace(tzinfo=pytz.utc)
        dt2 = dt2.replace(tzinfo=pytz.utc)
        dt1 = pd.to_datetime(dt1, utc=True)
        dt2 = pd.to_datetime(dt2, utc=True)
    grouped_df = df[[case_id_glue, timestamp_key]].groupby(df[case_id_glue])
    first = grouped_df.first()
    last = grouped_df.last()
    last.columns = [str(col) + '_2' for col in last.columns]
    stacked = pd.concat([first, last], axis=1)
    stacked = stacked[stacked[timestamp_key] > dt1]
    stacked = stacked[stacked[timestamp_key + "_2"] < dt2]
    i1 = df.set_index(case_id_glue).index
    i2 = stacked.set_index(case_id_glue).index
    return df[i1.isin(i2)]
def convert_timestamp_columns_in_df(df,
                                    timest_format=None,
                                    timest_columns=None):
    """
    Convert all dataframe columns in a dataframe

    Parameters
    -----------
    df
        Dataframe
    timest_format
        (If provided) Format of the timestamp columns in the CSV file
    timest_columns
        Columns of the CSV that shall be converted into timestamp

    Returns
    ------------
    df
        Dataframe with timestamp columns converted

    """
    needs_conversion = check_pandas_ge_024()
    for col in df.columns:
        if timest_columns is None or col in timest_columns:
            if df[col].dtype == 'object':
                try:
                    if timest_format is None:
                        if needs_conversion:
                            df[col] = pd.to_datetime(df[col], utc=True)
                        else:
                            df[col] = pd.to_datetime(df[col])
                    else:
                        if needs_conversion:
                            df[col] = pd.to_datetime(df[col],
                                                     utc=True,
                                                     format=timest_format)
                        else:
                            df[col] = pd.to_datetime(df[col])
                except ValueError:
                    # print("exception converting column: "+str(col))
                    pass
    return df
示例#5
0
def apply_events(df, dt1, dt2, parameters=None):
    """
    Get a new log containing all the events contained in the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               DEFAULT_TIMESTAMP_KEY)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    needs_conversion = check_pandas_ge_024()
    if needs_conversion:
        dt1 = dt1.replace(tzinfo=pytz.utc)
        dt2 = dt2.replace(tzinfo=pytz.utc)
        dt1 = pd.to_datetime(dt1, utc=True)
        dt2 = pd.to_datetime(dt2, utc=True)

    df = df[df[timestamp_key] > dt1]
    df = df[df[timestamp_key] < dt2]

    return df