def stringify(df, codes, cols, pid='pid', start_time='in_date', replace=None,
              end_time=None, sep=None, new_sep=None, single_value_columns=None,
              out='series'):
    codes = _listify(codes)
    cols = _listify(cols)

    single_cols = infer_single_value_columns(df=df, cols=cols, sep=sep,
                                             check_all=True)
    multiple_cols = list(set(cols) - set(single_cols))

    # use expanded codes, not the codes argument, because a code may appear
    # in a compound column and not in the single-value columns
    expanded_codes = expand_codes(df=df, cols=cols, codes=codes, sep=sep)

    if single_cols:
        single_events = stringify_singles(df=df, codes=expanded_codes,
                                          cols=single_cols, pid=pid,
                                          start_time=start_time,
                                          replace=replace, end_time=end_time,
                                          out='df')
        all_events = single_events

    if multiple_cols:
        multiple_events = stringify_multiples(df=df, codes=expanded_codes,
                                              cols=multiple_cols, pid=pid,
                                              start_time=start_time,
                                              replace=replace,
                                              end_time=end_time, out='df')
        all_events = multiple_events

    if single_cols and multiple_cols:
        all_events = pd.concat([multiple_events, single_events])

    if out == 'series':
        events_by_id = (all_events.sort_values([pid, start_time])
                        .groupby(pid)['events'].sum())
        return events_by_id
    elif out == 'df':
        return all_events
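
# A minimal usage sketch for stringify (hypothetical data: the dataframe
# 'df', the codes and the code-to-letter mapping below are illustrative
# assumptions, including the assumption that 'replace' maps codes to
# single characters):
#
#   letters = {'4AB02': 'i', '4AB04': 'a'}
#   per_person = stringify(df, codes=['4AB02', '4AB04'], cols='ncmp',
#                          sep=',', replace=letters, out='series')
#   # per_person would then be indexed by pid, with one concatenated
#   # event string per person, ordered by in_date
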
def get_ids(df, codes, cols, groups, pid='pid', out=None, sep=None):
    codes = _listify(codes)
    groups = _listify(groups)

    codes = expand_codes(df=df, codes=codes, cols=cols, sep=sep)
    rows_with_codes = get_rows(df=df, codes=codes, cols=cols, sep=sep)

    grouped_ids = df[rows_with_codes].groupby(groups)[pid].unique()
    grouped_ids = grouped_ids.apply(set)

    return grouped_ids
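
# A minimal usage sketch for get_ids (hypothetical data: a dataframe 'df'
# with an 'icd' code column and a 'region' grouping column is an
# illustrative assumption):
#
#   ids_by_region = get_ids(df, codes='K50*', cols='icd', groups='region',
#                           sep=',')
#   # ids_by_region['north'] would then be the set of pids in the 'north'
#   # group that have at least one code matching K50*
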
def find_changes(df, codes=None, cols=None, sep=None, groups=None,
                 interact=False, _fix=True, threshold=3):
    """
    Identifies large increases or decreases in the use of given codes in the
    specified groups

    rename? This is more of an increase identifier than a spike identifier,
    since a spike implies relatively low use both before and after compared
    to the "spike" itself

    rem: spikes can also span groups of years (a spike over two years, then
    down again)
    """
    sub = df
    groups = _listify(groups)

    if _fix:
        df, cols = _to_df(df, cols)
        codes, cols, allcodes, sep = _fix_args(df=df, codes=codes, cols=cols,
                                               sep=sep, merge=False,
                                               group=False)
        rows = get_rows(df=df, codes=allcodes, cols=cols, sep=sep, _fix=False)
        sub = df[rows]

    all_groups = {}
    for group in groups:
        counted = []
        for name, codelist in codes.items():
            count = sub.groupby(group).apply(count_codes,
                                             codes={name: codelist},
                                             cols=cols, sep=sep, dropna=True,
                                             _fix=False)
            counted.append(count)
        counted = pd.concat(counted, axis=1)

        if threshold:
            # normalize each change by the mean absolute change, then keep
            # only the changes above the threshold
            counted_delta = (counted.pct_change()
                             / counted.pct_change().abs().mean())
            counted_delta = counted_delta[counted_delta > threshold]
            counted = counted_delta

        all_groups[group] = counted

    return all_groups
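
# A minimal usage sketch for find_changes (hypothetical data: a dataframe
# 'df' with a 'year' column to group by and an 'atc' code column is an
# illustrative assumption):
#
#   changes = find_changes(df, codes={'biologics': '4AB*'}, cols='atc',
#                          groups='year', sep=',', threshold=3)
#   # changes['year'] would then hold the normalized year-to-year changes
#   # that exceed the threshold, one column per named code group
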
def _get_mask(df, codes, cols, sep=None):
    """
    Returns a dataframe with one boolean column for each column in cols,
    indicating whether the row contains any of the given code(s)

    Args:
        df (dataframe): dataframe
        codes (str or list of str): the desired codes
        cols (str or list of str): name of column(s) with the codes
        sep (str): separator between codes in a cell

    Returns:
        dataframe
    """
    codes = _listify(codes)
    cols = _listify(cols)

    cols = _expand_cols(df=df, cols=cols)
    expanded_codes = expand_codes(df=df, codes=codes, cols=cols, sep=sep)

    # if a cell may contain multiple (compound) codes
    if sep:
        expanded_codes_regex = '|'.join(expanded_codes)
        b = pd.DataFrame()
        for col in cols:
            b[col] = df[col].str.contains(expanded_codes_regex,
                                          na=False).values
    # if cells contain single values only
    else:
        b = df[cols].isin(expanded_codes)
    return b
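
# A minimal usage sketch for _get_mask (hypothetical data: a dataframe
# 'df' with comma-separated codes in an 'icd' column is an illustrative
# assumption):
#
#   mask = _get_mask(df, codes=['K50*'], cols='icd', sep=',')
#   # mask['icd'] is True for rows whose cell contains at least one
#   # expanded K50 code; to flag rows matching in any column:
#   any_code = mask.any(axis=1)
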
def expand_replace(df, replace, cols, sep=None, strip=True):
    """
    Takes a dictionary of shorthand codes and replacements, and returns a
    dictionary with all the codes expanded

    Example:
        expand_replace(df=df, replace={'AB04*': 'b'}, cols='atc')

        May return
            {'AB04a': 'b', 'AB04b': 'b', 'AB04c': 'b'}
    """
    # may use a regex instead, but that may also be slower to apply later?
    cols = _listify(cols)
    codes = list(replace.keys())

    codes = expand_codes(df=df, codes=codes, cols=cols, sep=sep)

    unexpanded = {code: text for code, text in replace.items() if '*' in code}

    for starcode, text in unexpanded.items():
        startswith, endswith = starcode.split('*')

        # separate dicts (a chained assignment would alias the same object)
        starting_codes = {}
        ending_codes = {}
        start_and_end_codes = {}

        if startswith:
            starting_codes = {code: text for code in codes
                              if code.startswith(startswith)}
        if endswith:
            ending_codes = {code: text for code in codes
                            if code.endswith(endswith)}
        if startswith and endswith:
            start_and_end_codes = {x: starting_codes[x]
                                   for x in starting_codes
                                   if x in ending_codes}

        replace.update({**starting_codes, **ending_codes,
                        **start_and_end_codes})
        del replace[starcode]
    return replace
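
# A minimal usage sketch for expand_replace (hypothetical data: an 'atc'
# column containing codes that start with 'AB04' is an illustrative
# assumption):
#
#   mapping = expand_replace(df, replace={'AB04*': 'b'}, cols='atc')
#   df['atc_replaced'] = df['atc'].replace(mapping)
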
def find_spikes(df, codes=None, cols=None, persons=False, pid='pid', sep=None,
                groups=None, each_group=False, _fix=True, threshold=3,
                divide_by='pid'):
    """
    Identifies large increases or decreases in the use of given codes in the
    specified groups

    rename? This is more of an increase identifier than a spike identifier,
    since a spike implies relatively low use both before and after compared
    to the "spike" itself

    rem: spikes can also span groups of years (a spike over two years, then
    down again)

    Example:
        find_spikes(df=npr, codes='4AB04', cols='ncmp', sep=',', pid='pid',
                    groups='region', threshold=3, divide_by='pid')
    """
    sub = df
    groups = _listify(groups)

    if _fix:
        sub, cols = _to_df(sub, cols)
        codes, cols, allcodes, sep = _fix_args(df=sub, codes=codes, cols=cols,
                                               sep=sep, merge=False,
                                               group=False)
        rows = get_rows(df=sub, codes=allcodes, cols=cols, sep=sep,
                        _fix=False)
        sub = sub[rows]

    if persons:
        counted = sub.groupby(groups).count_persons(codes=codes, cols=cols,
                                                    sep=sep, _fix=False)
    else:
        counted = sub.groupby(groups).apply(count_codes, codes=codes,
                                            cols=cols, sep=sep)

    if divide_by:
        divisor = sub.groupby(groups)[divide_by].nunique()
        counted = counted / divisor

    avg = counted.mean()
    sd = counted.std()

    counted.plot.bar()

    # flag groups that deviate from the mean by more than threshold
    # standard deviations
    deviations = (counted - avg) / sd
    spikes = counted[deviations.abs() > threshold]

    return spikes
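
# A minimal usage sketch for find_spikes, following the example values in
# the docstring above (the dataframe 'npr' with a 'region' grouping column
# and an 'ncmp' code column is assumed to exist):
#
#   spikes = find_spikes(npr, codes='4AB04', cols='ncmp', sep=',',
#                        groups='region', threshold=3, divide_by='pid')
#   # spikes lists the per-region counts that deviate from the mean by
#   # more than three standard deviations
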