def max_aggregate(df, uid, col):
    """Aggregates pandas dataframe by grouping by uid
    and taking maximum (non-NA) observation in col for each group.
    uids with only NA values in col are not returned in
    aggregated dataframe.

    Parameters
    ----------
    df : pandas DataFrame
    uid : str
        Name of unique ID column
    col : str
        Name of column to be aggregated

    Returns
    -------
    ma_df : pandas DataFrame

    Examples
    --------
    """
    df = df[[uid, col]].drop_duplicates()
    df.dropna(axis=0, how='any', inplace=True)
    kd_df = keep_duplicates(df, uid)
    if kd_df.empty:
        ma_df = df
    else:
        df = remove_duplicates(df, uid)
        groups = kd_df.groupby(uid, as_index=False)
        groups = groups.agg(np.nanmax)
        ma_df = df.append(groups).reset_index(drop=True)
    return ma_df
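# Illustrative usage sketch (not part of the original module). It exercises
# max_aggregate on a toy frame and assumes pandas (pd), numpy (np), and the
# module's keep_duplicates/remove_duplicates helpers are in scope, as the
# function above already relies on them.
def _example_max_aggregate():
    toy = pd.DataFrame({'UID': [1, 1, 2, 3],
                        'pay': [100.0, 250.0, np.nan, 300.0]})
    out = max_aggregate(toy, 'UID', 'pay')
    # UID 2 has only an NA observation, so it is dropped entirely
    assert set(out['UID']) == {1, 3}
    # UID 1 keeps its maximum non-NA value
    assert out.set_index('UID').loc[1, 'pay'] == 250.0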
def test_keep_duplicates():
    '''test keep_duplicates'''
    input_df = pd.DataFrame(
        {'A': [1, 1, 1, 1],
         'B': [2, 2, 3, 4],
         'C': [1, 3, 1, 3]})
    orig_input_df = copy.deepcopy(input_df)
    output_df = pd.DataFrame(
        {'A': [1, 1],
         'B': [2, 2],
         'C': [1, 3]},
        index=[0, 1])
    results = general_utils.keep_duplicates(input_df, ['A', 'B'])
    assert results.equals(output_df)
    assert orig_input_df.equals(input_df)
def mode_aggregate(df, uid, col):
    """Aggregates pandas dataframe by grouping by uid
    and taking most common (non-NA) observation in col for each group.
    uids with only NA values in col are not returned in
    aggregated dataframe. If there are multiple mode values for a group,
    the first (last observed) is returned.

    Parameters
    ----------
    df : pandas DataFrame
    uid : str
        Name of unique ID column
    col : str
        Name of column to be aggregated

    Returns
    -------
    ma_df : pandas DataFrame

    Examples
    --------
    """
    warnings.filterwarnings("ignore")  # Ignore nan filtering warning
    df = df[[uid, col]]
    df.dropna(axis=0, how='any', inplace=True)
    kd_df = keep_duplicates(df, uid)
    if kd_df.empty:
        ma_df = df
    else:
        df = remove_duplicates(df, uid)
        groups = kd_df.groupby(uid, as_index=False)
        groups = groups.aggregate(
            lambda x: stats.mode(x, nan_policy='omit').mode[0])
        groups = pd.DataFrame(groups, columns=[uid, col])
        ma_df = df.append(groups).reset_index(drop=True)
    return ma_df
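# Illustrative usage sketch (not part of the original module). It assumes the
# same module namespace as above (pd, scipy.stats, keep_duplicates,
# remove_duplicates) and shows mode_aggregate keeping the most common non-NA
# value per UID.
def _example_mode_aggregate():
    toy = pd.DataFrame({'UID': [1, 1, 1, 2],
                        'birth_year': [1970, 1970, 1971, 1980]})
    out = mode_aggregate(toy, 'UID', 'birth_year')
    # UID 1 reports 1970 twice and 1971 once, so 1970 is kept
    assert out.set_index('UID').loc[1, 'birth_year'] == 1970
    # UID 2 has a single observation, which passes through unchanged
    assert out.set_index('UID').loc[2, 'birth_year'] == 1980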
def assign_unique_ids(df, uid, id_cols, conflict_cols=None,
                      log=None, unresolved_policy='distinct'):
    """Assigns unique IDs (uid) to dataframe based on id_cols groupings.
    If conflict_cols are specified, conflicts will be resolved
    to determine if id_cols groupings with differing conflict_cols
    information are actually distinct; unresolved conflicts can be
    handled as 'distinct', 'same', or 'manual'.

    Parameters
    ----------
    df : pandas DataFrame
    uid : str
        Name of unique ID column
    id_cols : list
        List of column names used for grouping
    conflict_cols : list
        List of column names used for conflict evaluation
    log : logger
        If given, uid_report will be generated and logged as info
    unresolved_policy : str
        Determines how to handle unresolved conflicts:
        'distinct' = each unresolved in a group is distinct,
        'same' = each unresolved in a group is the same,
        'manual' = send unresolved groups to manual_resolve()

    Returns
    -------
    out_df : pandas DataFrame
    """
    if conflict_cols is None:
        conflict_cols = []
    dfu = df[id_cols + conflict_cols].drop_duplicates()
    dfu.reset_index(drop=True, inplace=True)

    full_row_count = df.shape[0]
    unique_rows = dfu.shape[0]
    conflict_rows = 0
    conflicts_resolved = 0

    if conflict_cols:
        rd_df = remove_duplicates(dfu, id_cols).reset_index(drop=True)
        rd_df.insert(0, uid, rd_df.index + 1)

        kd_df = keep_duplicates(dfu, id_cols).reset_index(drop=True)
        kd_df[id_cols] = kd_df[id_cols].fillna(value=-9999)
        conflict_rows = kd_df.shape[0]

        next_uid = 1 if rd_df[uid].dropna().empty else rd_df[uid].max() + 1
        rc_df = resolve_conflicts(kd_df, id_cols, conflict_cols,
                                  uid, next_uid)
        if log:
            log.info('%d resolved conflicts. %d unresolved conflicts'
                     % (rc_df[uid].count(),
                        rc_df[uid].size - rc_df[uid].count()))

        if not rc_df[uid].dropna().empty:
            next_uid = rc_df[uid].max() + 1

        if rc_df[uid].isnull().sum() > 0:
            if unresolved_policy == 'distinct':
                rc_df.loc[rc_df[uid].isnull(), uid] = \
                    np.arange(next_uid,
                              next_uid + rc_df[uid].isnull().sum())
            elif unresolved_policy == 'same':
                sdf = pd.DataFrame()
                for k, g in rc_df[rc_df[uid].isnull()].groupby(id_cols):
                    g[uid] = next_uid
                    next_uid += 1
                    sdf = sdf.append(g)
                rc_df = rc_df.dropna(subset=[uid]).append(sdf)
            elif unresolved_policy == 'manual':
                mr_df = pd.DataFrame()
                for k, g in rc_df\
                        .loc[rc_df[uid].isnull(),
                             id_cols + conflict_cols]\
                        .groupby(id_cols, as_index=False):
                    g = manual_resolve(g, uid, next_uid)
                    next_uid = g[uid].max() + 1
                    mr_df = mr_df.append(g)
                if log:
                    log.info('Unresolved conflicts resolved by "%s" into %d ids'
                             % (unresolved_policy, mr_df[uid].nunique()))
                rc_df = rc_df.dropna(subset=[uid]).append(mr_df)

        rc_df[id_cols] = rc_df[id_cols].replace({-9999: np.nan})
        if rc_df.shape[0] == 0:
            conflicts_resolved = 0
        else:
            conflicts_resolved = rc_df[uid].nunique()
        df = df.merge(rd_df.append(rc_df),
                      on=id_cols + conflict_cols, how='left')
    else:
        dfu[uid] = dfu.index + 1
        df = df.merge(dfu, on=id_cols, how='left')

    assert keep_duplicates(df[[uid] + id_cols].drop_duplicates(), uid).empty,\
        'This should not happen. Same uids between id_col groupings.'
    assert df[df[uid].isnull()].shape[0] == 0,\
        'Some unique IDs are null:\n%s' % df[df[uid].isnull()]
    assert max(df[uid]) == df[uid].nunique(),\
        'Unique IDs are not correctly scaled'

    uid_count = max(df[uid])
    uid_report = generate_uid_report(full_row_count, unique_rows,
                                     conflict_rows, conflicts_resolved,
                                     uid_count)
    if log:
        log.info(uid_report)
    else:
        print(uid_report)

    return df
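# Illustrative usage sketch (not part of the original module). It exercises
# only the no-conflict path (conflict_cols left as None), so besides pandas it
# relies only on the keep_duplicates and generate_uid_report helpers that the
# function above already calls.
def _example_assign_unique_ids():
    toy = pd.DataFrame({'first_name': ['JOHN', 'JOHN', 'JANE'],
                        'last_name': ['SMITH', 'SMITH', 'DOE'],
                        'rank': ['PO', 'SGT', 'PO']})
    out = assign_unique_ids(toy, 'UID', ['first_name', 'last_name'])
    # Both JOHN SMITH rows share one UID; JANE DOE gets her own
    assert out['UID'].nunique() == 2
    assert out.loc[out['last_name'] == 'SMITH', 'UID'].nunique() == 1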
def loop_merge(self, custom_merges=[], verbose=True, one_to_one=True,
               base_OD_edits=OrderedDict()):
    """Performs iterative pairwise joins to produce dataframe of merges

    Loops over on_lists to iteratively merge ref_um and sup_um,
    continually removing unique ids in sup_um (and ref_um if
    one_to_one=True) that were successfully merged.

    Parameters
    ----------
    custom_merges : list (of lists or dicts)
        List of user entered custom merge lists
    verbose : bool
        If True successful merges are printed
    one_to_one : bool
        If True assumes sup_um is successfully deduplicated and
        uids are dropped from ref_um after successful merges.
        If False uids are not dropped from ref_um after successful merges
    base_OD_edits : OrderedDict
        OrderedDict of alterations to base_OD (preserving initial order)

    Returns
    -------
    self
    """
    intersect_cols = list_diff(
        list_intersect(self.ref_um.columns, self.sup_um.columns),
        [self.uid, self.sup_id])
    self.ref_um = self.ref_um[[self.uid] + intersect_cols].drop_duplicates()
    self.sup_um = self.sup_um[[self.sup_id] + intersect_cols].drop_duplicates()
    ref_ids = self.ref_um[self.uid].nunique()
    sup_ids = self.sup_um[self.sup_id].nunique()
    multi = self.sup_um[self.sup_id].size > sup_ids
    self.on_lists = self.generate_on_lists(intersect_cols, custom_merges,
                                           base_OD_edits)
    self.id_cols = [self.uid, self.sup_id]
    self.merged_df = pd.DataFrame(columns=self.id_cols + ['matched_on'])
    self.log.info('Beginning loop_merge.')
    for merge_cols in self.on_lists:
        assert len(merge_cols) > 0
        reft = self.ref_um
        supt = self.sup_um
        if isinstance(merge_cols, dict):
            reft = reft.query(merge_cols['query'])
            supt = supt.query(merge_cols['query'])
            merge_cols = merge_cols['cols']
        reft = remove_duplicates(
            reft[[self.uid] + merge_cols].dropna(how='any'),
            merge_cols, True)
        if one_to_one:
            supt = remove_duplicates(
                supt[[self.sup_id] + merge_cols].dropna(how='any'),
                merge_cols, True)
        else:
            supt = supt[[self.sup_id] + merge_cols].dropna(how='any')
        mergedt = reft.merge(supt, on=merge_cols, how='inner')[self.id_cols]
        if multi:
            mergedt = mergedt.drop_duplicates()
        if mergedt.shape[0] > 0:
            if verbose:
                print('%d Matches on \n %s columns'
                      % (mergedt.shape[0], merge_cols))
            mergedt['matched_on'] = '-'.join(merge_cols)
            self.merged_df = self.merged_df\
                .append(mergedt[self.id_cols + ['matched_on']])\
                .reset_index(drop=True)
            if one_to_one:
                self.ref_um = self.ref_um.loc[
                    ~self.ref_um[self.uid].isin(self.merged_df[self.uid])]
            self.sup_um = self.sup_um.loc[~self.sup_um[self.sup_id].isin(
                self.merged_df[self.sup_id])]
    self.merged_df.reset_index(drop=True, inplace=True)
    self.log_merge_report(self.merged_df.shape[0], ref_ids, sup_ids)
    self.log.info('\n%s', self.merged_df['matched_on'].value_counts())
    if one_to_one:
        kds = keep_duplicates(self.merged_df, [self.uid])
        assert kds.empty,\
            print('Same UID matched to multiple sup_ids %s'
                  '\n Ref: %s \n Sup: %s'
                  % (kds,
                     self.ref_df[
                         self.ref_df[self.uid].isin(kds[self.uid])
                     ].sort_values(self.uid),
                     self.sup_df[
                         self.sup_df[self.sup_id].isin(kds[self.sup_id])
                     ].sort_values(self.sup_id)))
    kds = keep_duplicates(self.merged_df, [self.sup_id])
    assert kds.empty,\
        print('Same sup_id matched to multiple UIDs %s\nRef: %s\nSup: %s'
              % (kds,
                 self.ref_df[
                     self.ref_df[self.uid].isin(kds[self.uid])
                 ].sort_values(self.uid),
                 self.sup_df[
                     self.sup_df[self.sup_id].isin(kds[self.sup_id])
                 ].sort_values(self.sup_id)))
    return self
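# Illustrative call pattern (hypothetical object name, not from the original
# code): an already-constructed merge task holding ref_um, sup_um, the
# uid/sup_id column names, a logger, and generate_on_lists() would typically
# be driven as
#
#     task.loop_merge(custom_merges=[['first_name', 'last_name', 'birth_year']],
#                     one_to_one=True)
#
# after which task.merged_df holds the matched uid/sup_id pairs plus a
# 'matched_on' column recording which column list produced each match, and
# task.ref_um / task.sup_um retain only the still-unmatched records.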
df_FFN.loc[df_FFN['penalty_code'].notnull() &
           df_FFN['final_outcome_USE'].isnull(),
           'final_outcome_USE'] = \
    df_FFN.loc[df_FFN['penalty_code'].notnull() &
               df_FFN['final_outcome_USE'].isnull(),
               'penalty_code'].str.title()
df_FFN.loc[df_FFN.final_outcome_USE.isnull() &
           (df_FFN['final_finding'] == 'SUSTAINED') &
           (df_FFN['final_finding_narrative'].str.contains('No Penalty')),
           'final_outcome_USE'] = 'No Action Taken'
df_FFN.loc[df_FFN.final_outcome_USE.isnull() &
           (df_FFN['final_finding'] == 'SUSTAINED') &
           (df_FFN['final_finding_narrative'].str.contains('RESIGNED')),
           'final_outcome_USE'] = 'Resigned'

df_NFF = df.loc[~df.accusation_id.isin(df_FFN.accusation_id),
                ['accusation_id', 'cr_id', 'number_of_days',
                 'final_finding']].drop_duplicates()
assert keep_duplicates(
    df_NFF.drop('number_of_days', axis=1).drop_duplicates(),
    ['accusation_id', 'cr_id']).empty
df_NFF = df_NFF\
    .groupby(['accusation_id', 'cr_id', 'final_finding'], as_index=False)\
    .max()
df_NFF['final_finding_USE'] = df_NFF['final_finding']
# Flag day counts that have a dedicated label in outcome_dict rather than
# the generic 'N Day Suspension' string built below
loc_inds = df_NFF['number_of_days'].notnull() & \
    df_NFF['number_of_days'].astype(str).isin(outcome_dict.keys())
df_NFF['final_outcome_USE'] = df_NFF['number_of_days'].map(
    lambda x: np.nan if pd.isnull(x) else str(int(x)) + ' Day Suspension')
df_NFF.loc[loc_inds, 'final_outcome_USE'] = df_NFF.loc[
    loc_inds, 'number_of_days'].astype(str).replace(outcome_dict)

df_FO = df_NFF.append(df_FFN)[[
    'accusation_id', 'cr_id', 'final_finding_USE', 'final_outcome_USE']]
    return d


log.info('Assembling 1967 - 1999 complaints')
c1 = pd.read_csv(cons.input_cmpl1_file)
c1crs = set(c1.cr_id)
c1 = c1[['cr_id', 'incident_date', 'complaint_date',
         'closed_date', 'incident_location']]\
    .rename(columns={'incident_location': 'address'})\
    .drop_duplicates()
c1[cons.date_cols] = c1[cons.date_cols].apply(pd.to_datetime)
c1['address'] = c1['address'].fillna('').astype(str)
c1 = \
    keep_duplicates(c1, 'cr_id')\
    .groupby('cr_id', as_index=False)\
    .agg({
        'incident_date': min,
        'complaint_date': min,
        'closed_date': min,
        'address': lambda x: sorted(x, key=len)[-1]
    })\
    .append(remove_duplicates(c1, 'cr_id'))
assert c1.shape[0] == c1['cr_id'].nunique()
assert set(c1.cr_id) == c1crs
assert c1[c1.cr_id.isnull()].empty

log.info('Assembling 2000 - 2016 complaints')
c2 = pd.read_csv(cons.input_cmpl2_file)
c2crs = set(c2.cr_id)
c2_core = c2\
    .drop(['row_id'] + cons.date_cols, axis=1)\
    .drop_duplicates()
c2_core = keep_duplicates(c2_core, 'cr_id')\
def combine_histories(uh_list, resignation_df, log,
                      uid='UID', unit='unit',
                      start='unit_start_date', end='unit_end_date',
                      resignation_col='resignation_date'):
    """Combines multiple unit history dataframes into one
    containing unique unit movements for individuals,
    removing non-sensical data and filling missing data

    Parameters
    ----------
    uh_list : list
        List of unit history pandas DataFrame
    resignation_df : pandas DataFrame
        Contains data on resignation dates
    log : logging object
    uid : str
        Name of unique ID column in unit history and resignation date
        DataFrames
    unit : str
        Name of unit column in unit history DataFrames in uh_list
    start : str
        Name of unit start date column in unit history DataFrames in uh_list
    end : str
        Name of unit end date column in unit history DataFrames in uh_list
    resignation_col : str
        Name of resignation date column in resignation_df DataFrames

    Returns
    -------
    uh_df : pandas DataFrame
    """
    from assign_unique_ids_functions import aggregate_data
    uh_df = pd.DataFrame()
    for df in uh_list:
        df = df.loc[:, [uid, unit, start, end]]
        df.dropna(subset=[unit, uid, start], how='any', inplace=True)
        log.info(('%d rows with non-NA end date and end date '
                  'before/equal to start date'),
                 df[(df[end].notnull()) & (df[end] <= df[start])].shape[0])
        df.loc[(df[end].notnull()) & (df[end] <= df[start]), end] = np.nan
        uh_df = uh_df.append(df)

    uh_df.drop_duplicates(inplace=True)
    uh_df = uh_df.merge(resignation_df, on=uid, how='left')
    indexes = ((uh_df[resignation_col].notnull()) &
               (uh_df[end].isnull()) &
               (uh_df[start] < uh_df[resignation_col]))
    uh_df.loc[indexes, end] = uh_df.loc[indexes, resignation_col]
    uh_df.drop(resignation_col, axis=1, inplace=True)

    uh_rd = remove_duplicates(uh_df, [uid, start, unit])
    uh_kd = keep_duplicates(uh_df, [uid, start, unit])
    uh_kd = aggregate_data(uh_kd, uid=uid, id_cols=[start, unit],
                           max_cols=[end])
    assert uh_rd.shape[0] + uh_kd.shape[0] ==\
        uh_df[[uid, unit, start]].drop_duplicates().shape[0],\
        'Data set lost information after split and aggregation.'
    uh_df = uh_rd.append(uh_kd)
    uh_df.sort_values([uid, start, unit], inplace=True)
    uh_df.reset_index(drop=True, inplace=True)
    return uh_df
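# Illustrative call pattern (hypothetical variable names, not from the
# original code): given two unit-history frames and a resignation-date frame
# keyed on 'UID', the combined history would typically be built as
#
#     uh_df = combine_histories([uh_prod_df, uh_legacy_df], resignation_df,
#                               log=logging.getLogger(__name__))
#
# Rows missing a uid, unit, or start date are dropped, end dates on or before
# their start date are blanked, open-ended spells are closed at the
# resignation date when a later one is known, and duplicate (uid, start, unit)
# spells are collapsed to the latest end date via aggregate_data.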