示例#1
0
def filter_traces_intersecting(df, dt1, dt2, parameters=None):
    """
    Filter traces intersecting the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp
            Parameters.CASE_ID_KEY -> Column that contains the timestamp

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               DEFAULT_TIMESTAMP_KEY)
    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY,
                                              parameters, CASE_CONCEPT_NAME)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    needs_conversion = check_pandas_ge_024()
    if needs_conversion:
        dt1 = dt1.replace(tzinfo=pytz.utc)
        dt2 = dt2.replace(tzinfo=pytz.utc)
        dt1 = pd.to_datetime(dt1, utc=True)
        dt2 = pd.to_datetime(dt2, utc=True)
    grouped_df = df[[case_id_glue, timestamp_key]].groupby(df[case_id_glue])
    first = grouped_df.first()
    last = grouped_df.last()
    last.columns = [str(col) + '_2' for col in last.columns]
    stacked = pd.concat([first, last], axis=1)
    stacked1 = stacked[stacked[timestamp_key] > dt1]
    stacked1 = stacked1[stacked1[timestamp_key] < dt2]
    stacked2 = stacked[stacked[timestamp_key + "_2"] > dt1]
    stacked2 = stacked2[stacked2[timestamp_key + "_2"] < dt2]
    stacked3 = stacked[stacked[timestamp_key] < dt1]
    stacked3 = stacked3[stacked3[timestamp_key + "_2"] > dt2]
    stacked = pd.concat([stacked1, stacked2, stacked3], axis=0)
    i1 = df.set_index(case_id_glue).index
    i2 = stacked.set_index(case_id_glue).index
    return df[i1.isin(i2)]
示例#2
0
def convert_timestamp_columns_in_df(df,
                                    timest_format=None,
                                    timest_columns=None):
    """
    Convert all dataframe columns in a dataframe

    Parameters
    -----------
    df
        Dataframe
    timest_format
        (If provided) Format of the timestamp columns in the CSV file
    timest_columns
        Columns of the CSV that shall be converted into timestamp

    Returns
    ------------
    df
        Dataframe with timestamp columns converted

    """
    needs_conversion = check_pandas_ge_024()
    for col in df.columns:
        if timest_columns is None or col in timest_columns:
            if df[col].dtype == 'object':
                try:
                    if timest_format is None:
                        if needs_conversion:
                            df[col] = pd.to_datetime(df[col], utc=True)
                        else:
                            df[col] = pd.to_datetime(df[col])
                    else:
                        if needs_conversion:
                            df[col] = pd.to_datetime(df[col],
                                                     utc=True,
                                                     format=timest_format)
                        else:
                            df[col] = pd.to_datetime(df[col])
                except ValueError:
                    # print("exception converting column: "+str(col))
                    pass
    return df
示例#3
0
def apply_events(df, dt1, dt2, parameters=None):
    """
    Get a new log containing all the events contained in the given interval

    Parameters
    ----------
    df
        Pandas dataframe
    dt1
        Lower bound to the interval (possibly expressed as string, but automatically converted)
    dt2
        Upper bound to the interval (possibly expressed as string, but automatically converted)
    parameters
        Possible parameters of the algorithm, including:
            Parameters.TIMESTAMP_KEY -> Attribute to use as timestamp

    Returns
    ----------
    df
        Filtered dataframe
    """
    if parameters is None:
        parameters = {}

    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY,
                                               parameters,
                                               DEFAULT_TIMESTAMP_KEY)
    dt1 = get_dt_from_string(dt1)
    dt2 = get_dt_from_string(dt2)
    needs_conversion = check_pandas_ge_024()
    if needs_conversion:
        dt1 = dt1.replace(tzinfo=pytz.utc)
        dt2 = dt2.replace(tzinfo=pytz.utc)
        dt1 = pd.to_datetime(dt1, utc=True)
        dt2 = pd.to_datetime(dt2, utc=True)

    df = df[df[timestamp_key] > dt1]
    df = df[df[timestamp_key] < dt2]

    return df
示例#4
0
def get_dt_from_string(dt: Union[datetime, str]) -> datetime:
    """
    If the date is expressed as string, do the conversion to a datetime.datetime object

    Parameters
    -----------
    dt
        Date (string or datetime.datetime)

    Returns
    -----------
    dt
        Datetime object
    """
    if type(dt) is str:
        dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")

    needs_conversion = check_pandas_ge_024()
    if needs_conversion:
        dt = dt.replace(tzinfo=pytz.utc)
        dt = pd.to_datetime(dt, utc=True)

    return dt