Пример #1
0
def stringify(df,
              codes,
              cols,
              pid='pid',
              start_time='in_date',
              replace=None,
              end_time=None,
              sep=None,
              new_sep=None,
              single_value_columns=None,
              out='series'):
    """
    Turn each person's coded events into a string representation.

    Args:
        df (dataframe): dataframe with event-level observations
        codes (str or list of str): codes to include in the string
        cols (str or list of str): column(s) containing the codes
        pid (str): name of the person-id column
        start_time (str): name of the event start-time column
        replace (dict): optional mapping from codes to replacement text
        end_time (str): name of the event end-time column, if any
        sep (str): separator between multiple codes within one cell
        new_sep (str): separator to use in the output string
        single_value_columns (list): columns known to hold one code per cell
        out (str): 'series' to return one string per pid, 'df' for the
            event-level dataframe

    Returns:
        Series (pid -> concatenated event string) or dataframe,
        depending on ``out``.
    """
    codes = _listify(codes)
    cols = _listify(cols)

    single_cols = infer_single_value_columns(df=df,
                                             cols=cols,
                                             sep=sep,
                                             check_all=True)

    # BUG FIX: the original wrapped the set difference in a one-element
    # list ([set(cols) - set(single_cols)]), which is truthy even when the
    # difference is empty, so the multi-value branch always ran. Use a
    # plain list of the remaining (compound) columns instead.
    multiple_cols = list(set(cols) - set(single_cols))

    # Use the expanded codes, not the raw argument: a code may appear in a
    # compound column and not in the single-value columns.
    expanded_codes = expand_codes(df=df, cols=cols, codes=codes, sep=sep)

    # Initialized so the function fails loudly (AttributeError on None)
    # rather than with an UnboundLocalError if neither branch runs.
    all_events = None

    if single_cols:
        single_events = stringify_singles(df=df,
                                          codes=expanded_codes,
                                          cols=single_cols,
                                          pid=pid,
                                          start_time=start_time,
                                          replace=replace,
                                          end_time=end_time,
                                          out='df')
        all_events = single_events

    if multiple_cols:
        multiple_events = stringify_multiples(df=df,
                                              codes=expanded_codes,
                                              cols=multiple_cols,
                                              pid=pid,
                                              start_time=start_time,
                                              replace=replace,
                                              end_time=end_time,
                                              out='df')
        all_events = multiple_events

    if single_cols and multiple_cols:
        all_events = pd.concat([multiple_events, single_events])

    if out == 'series':
        # Concatenate each person's event strings in chronological order.
        events_by_id = all_events.sort_values([pid, start_time]).groupby(pid)['events'].sum()
        return events_by_id
    elif out == 'df':
        return all_events
Пример #2
0
def get_ids(df, codes, cols, groups, pid='pid', out=None, sep=None):
    """
    Return, for each group, the set of person ids that have the given codes.

    Args:
        df (dataframe): dataframe with event-level observations
        codes (str or list of str): codes to look for
        cols (str or list of str): column(s) containing the codes
        groups (str or list of str): column(s) to group the ids by
        pid (str): name of the person-id column
        out: unused; kept for interface compatibility
        sep (str): separator between multiple codes within one cell

    Returns:
        Series indexed by group, each value a set of person ids.
    """
    codes = _listify(codes)
    # BUG FIX: the original called _listify(groupby) on the undefined name
    # 'groupby', raising NameError on every call; listify 'groups' instead.
    groups = _listify(groups)

    codes = expand_codes(df=df, codes=codes, cols=cols, sep=sep)

    rows_with_codes = get_rows(df=df, codes=codes, cols=cols, sep=sep)
    grouped_ids = df[rows_with_codes].groupby(groups)[pid].unique()
    # Convert the arrays of unique ids to sets for easy set algebra later.
    grouped_ids = grouped_ids.apply(set)

    return grouped_ids
Пример #3
0
def find_changes(df,
                 codes=None,
                 cols=None,
                 sep=None,
                 groups=None,
                 interact=False,
                 _fix=True,
                 threshold=3):
    """
    Identify large increases or decreases in the use of given codes within
    the specified groups.

    NOTE(review): more of an "increase identifier" than a spike identifier;
    a spike implies relatively low values both before and after. Spikes can
    also span several periods (up for two years, then down again).

    Args:
        df (dataframe): dataframe with event-level observations
        codes (dict or list): codes (or named code lists) to count
        cols (str or list of str): column(s) containing the codes
        sep (str): separator between multiple codes within one cell
        groups (str or list of str): column(s) to group counts by
        interact (bool): unused; kept for interface compatibility
        _fix (bool): if True, normalize df/codes/cols arguments first
        threshold (float): keep only relative changes larger than this

    Returns:
        dict mapping each group column to a dataframe of flagged changes.
    """
    sub = df
    groups = _listify(groups)

    if _fix:
        df, cols = _to_df(df, cols)
        codes, cols, allcodes, sep = _fix_args(df=df,
                                               codes=codes,
                                               cols=cols,
                                               sep=sep,
                                               merge=False,
                                               group=False)
        rows = get_rows(df=df, codes=allcodes, cols=cols, sep=sep, _fix=False)
        sub = df[rows]

    all_groups = {}

    for group in groups:
        # One count column per named code list.
        counted = [
            sub.groupby(group).apply(count_codes,
                                     codes={name: codelist},
                                     cols=cols,
                                     sep=sep,
                                     dropna=True,
                                     _fix=False)
            for name, codelist in codes.items()
        ]
        counted = pd.concat(counted, axis=1)

        if threshold:
            # Scale each period-to-period change by the mean absolute
            # change, then keep only changes above the threshold.
            # (pct_change hoisted so it is computed once, not twice.)
            delta = counted.pct_change()
            scaled_delta = delta / delta.abs().mean()
            counted = scaled_delta[scaled_delta > threshold]

        all_groups[group] = counted

    return all_groups
Пример #4
0
def _get_mask(df,
              codes,
              cols,
              sep=None):
    """
    Build a boolean dataframe with one column per code column, True where
    the row contains one of the requested code(s).

    Args:

        df (dataframe): dataframe
        codes (str or list of str): the desired codes
        cols (str or list of str): name of column(s) with the codes
        sep (str): separator between codes in a cell

    Returns:

        dataframe of booleans, same row order as df

    """
    codes = _listify(codes)
    cols = _expand_cols(df=df, cols=_listify(cols))

    expanded = expand_codes(df=df,
                            codes=codes,
                            cols=cols,
                            sep=sep)

    # Compound cells: a cell may hold several codes joined by sep, so test
    # each cell against a regex alternation of all expanded codes.
    # NOTE(review): codes are not regex-escaped here — presumably they are
    # plain alphanumerics; verify if codes can contain regex metacharacters.
    if sep:
        pattern = '|'.join(expanded)
        mask = pd.DataFrame()
        for column in cols:
            mask[column] = df[column].str.contains(pattern, na=False).values
        return mask

    # Single-value cells: exact membership is enough.
    return df[cols].isin(expanded)
Пример #5
0
def expand_replace(df, replace, cols, sep=None, strip=True):
    """
    Takes a dictionary of shorthand codes and replacements, and returns a
    dictionary with all the codes expanded

    Example:
        expand_replace(df=df, replace={'AB04*':'b'}, col='atc')

        May return
            {'AB04a':'b', 'AB04b': 'b', 'AB04c':'b'}

    Args:
        df (dataframe): dataframe whose columns define the known codes
        replace (dict): mapping from (possibly starred) codes to text
        cols (str or list of str): column(s) containing the codes
        sep (str): separator between codes in a cell
        strip (bool): unused; kept for interface compatibility

    Returns:
        dict with every '*'-pattern replaced by its concrete matches
    """
    # may use regex instead, but this may also be slower to use later?
    cols = _listify(cols)
    codes = list(replace.keys())

    # BUG FIX: the original hard-coded sep=None here, silently ignoring the
    # caller's sep; pass it through so compound cells are expanded too.
    codes = expand_codes(df=df, codes=codes, cols=cols, sep=sep)

    unexpanded = {code: text for code, text in replace.items() if '*' in code}

    for starcode, text in unexpanded.items():

        startswith, endswith = starcode.split('*')

        # Fresh dicts each iteration so patterns do not leak between codes.
        starting_codes = {}
        ending_codes = {}
        start_and_end_codes = {}

        if startswith:
            starting_codes = {code: text for code in codes if code.startswith(startswith)}
        if endswith:
            ending_codes = {code: text for code in codes if code.endswith(endswith)}
        if startswith and endswith:
            # BUG FIX: the original used the dict itself as the key
            # ({starting_codes: ...}), which raises TypeError (dicts are
            # unhashable); key on each matching code instead.
            start_and_end_codes = {x: starting_codes[x] for x in starting_codes if x in ending_codes}

        replace.update({**starting_codes, **ending_codes, **start_and_end_codes})

        # Drop the shorthand pattern now that its expansions are in place.
        del replace[starcode]
    return replace
Пример #6
0
def find_spikes(df,
                codes=None,
                cols=None,
                persons=False,
                pid='pid',
                sep=None,
                groups=None,
                each_group=False,
                _fix=True,
                threshold=3,
                divide_by='pid'):
    """
    Identify unusually large counts of the given codes in the specified
    groups.

    NOTE(review): more of an "increase identifier" than a spike identifier;
    a spike implies relatively low values both before and after. Spikes can
    also span several periods (up for two years, then down again).

    Args:
        df (dataframe): dataframe with event-level observations
        codes (str, list or dict): codes to count
        cols (str or list of str): column(s) containing the codes
        persons (bool): count distinct persons instead of events
        pid (str): name of the person-id column
        sep (str): separator between multiple codes within one cell
        groups (str or list of str): column(s) to group counts by
        each_group (bool): unused; kept for interface compatibility
        _fix (bool): if True, normalize df/codes/cols arguments first
        threshold (float): flag groups more than this many sd from the mean
        divide_by (str): column whose nunique per group normalizes counts

    Returns:
        Series of the (normalized) counts in the flagged groups.
    """
    sub = df
    groups = _listify(groups)

    if _fix:
        sub, cols = _to_df(sub, cols)
        codes, cols, allcodes, sep = _fix_args(df=sub,
                                               codes=codes,
                                               cols=cols,
                                               sep=sep,
                                               merge=False,
                                               group=False)
        # BUG FIX: the original computed rows on the original df but used
        # them to index sub (the frame returned by _to_df); compute the
        # mask on the same frame it indexes.
        rows = get_rows(df=sub, codes=allcodes, cols=cols, sep=sep, _fix=False)
        sub = sub[rows]

    if persons:
        counted = sub.groupby(groups).count_persons(codes=codes,
                                                    cols=cols,
                                                    sep=sep,
                                                    _fix=False)
    else:
        counted = sub.groupby(groups).apply(count_codes,
                                            codes=codes,
                                            cols=cols,
                                            sep=sep)

    if divide_by:
        # Normalize counts by group size (e.g. persons per group).
        divisor = sub.groupby(groups)[divide_by].nunique()
        counted = counted / divisor

    avg = counted.mean()
    sd = counted.std()
    counted.plot.bar()

    # Z-score of each group's count. BUG FIX: the original immediately
    # overwrote this with (counted / avg) / avg, which discards the
    # z-score and has no clear statistical meaning.
    deviations = (counted - avg) / sd

    # BUG FIX: the original called the Series — counted(...) — which
    # raises TypeError; boolean-index it to select the outliers.
    spikes = counted[deviations.abs() > threshold]

    return spikes