Example #1
0
def max_aggregate(df, uid, col):
    """Aggregates pandas dataframe by grouping by uid
       and taking maximum (non-NA) observation in col for each group.
       uids with only NA values in col are not returned in aggregated dataframe.
    Parameters
    ----------
    df : pandas DataFrame
    uid : str
        Name of unique ID column
    col : str
        Name of column to be aggregated

    Returns
    -------
    ma_df : pandas DataFrame

    Examples
    --------
    """
    df = df[[uid, col]].drop_duplicates()
    df.dropna(axis=0, how='any', inplace=True)
    kd_df = keep_duplicates(df, uid)
    if kd_df.empty:
        ma_df = df
    else:
        df = remove_duplicates(df, uid)
        groups = kd_df.groupby(uid, as_index=False)
        groups = groups.agg(np.nanmax)
        ma_df = df.append(groups).reset_index(drop=True)
    return ma_df
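A minimal usage sketch of what max_aggregate computes, written in plain pandas with made-up data (the keep_duplicates/remove_duplicates helpers from the surrounding module are not reproduced here):

import pandas as pd

# Hypothetical input: uid 1 has two observations, uid 2 has only an NA value.
toy = pd.DataFrame({'uid': [1, 1, 2, 3], 'score': [5.0, 9.0, None, 2.0]})

# Equivalent of max_aggregate(toy, 'uid', 'score'): drop NA rows, then take the
# per-uid maximum; uid 2 disappears because it has no non-NA observation.
out = (toy.dropna(subset=['score'])
          .groupby('uid', as_index=False)['score']
          .max())
print(out)  # uids 1 and 3 with scores 9.0 and 2.0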
Example #2
0
def test_keep_duplicates():
    '''test keep_duplicates'''
    input_df = pd.DataFrame(
        {'A': [1, 1, 1, 1],
         'B': [2, 2, 3, 4],
         'C': [1, 3, 1, 3]})
    orig_input_df = copy.deepcopy(input_df)
    output_df = pd.DataFrame(
        {'A': [1, 1],
         'B': [2, 2],
         'C': [1, 3]},
        index=[0, 1])

    results = general_utils.keep_duplicates(input_df,
                                            ['A', 'B'])
    assert results.equals(output_df)
    assert orig_input_df.equals(input_df)
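The test above expects keep_duplicates to return every row whose key columns appear more than once, preserving the original index. A hypothetical stand-in using pandas' duplicated(keep=False) reproduces that expected output (this is a sketch, not the project's implementation):

import pandas as pd

def keep_duplicates_sketch(df, cols):
    # Hypothetical stand-in: keep only rows whose `cols` values occur more than once.
    return df[df.duplicated(subset=cols, keep=False)]

toy = pd.DataFrame({'A': [1, 1, 1, 1], 'B': [2, 2, 3, 4], 'C': [1, 3, 1, 3]})
print(keep_duplicates_sketch(toy, ['A', 'B']))  # rows 0 and 1, matching output_df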
Example #3
0
def mode_aggregate(df, uid, col):
    """Aggregates pandas dataframe by grouping by uid
       and taking most common (non-NA) observation in col for each group.
       uids with only NA values in col are not returned in
       aggregated dataframe. If there are multiple mode values
       for a group the first (last observed) is returned.

    Parameters
    ----------
    df : pandas DataFrame
    uid : str
        Name of unique ID column
    col : str
        Name of column to be aggregated

    Returns
    -------
    ma_df : pandas DataFrame

    Examples
    --------
    """
    warnings.filterwarnings("ignore")  # Ignore nan filtering warning
    df = df[[uid, col]].dropna(axis=0, how='any')
    kd_df = keep_duplicates(df, uid)
    if kd_df.empty:
        ma_df = df
    else:
        df = remove_duplicates(df, uid)
        groups = kd_df.groupby(uid, as_index=False)
        groups = groups.aggregate(
            lambda x: stats.mode(x, nan_policy='omit').mode[0])
        groups = pd.DataFrame(groups, columns=[uid, col])
        ma_df = df.append(groups).reset_index(drop=True)
    return ma_df
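A small self-contained sketch of what mode_aggregate computes, using pandas' Series.mode instead of scipy's stats.mode (tie-breaking may therefore differ); the frame and column names are made up:

import pandas as pd

# Hypothetical frame: uid 1 has a clear mode (7.0), uid 2 has only NA values.
toy = pd.DataFrame({'uid': [1, 1, 1, 2], 'rank': [7.0, 7.0, 3.0, None]})

# Most common non-NA value per uid; uid 2 drops out entirely.
out = (toy.dropna(subset=['rank'])
          .groupby('uid', as_index=False)['rank']
          .agg(lambda s: s.mode().iloc[0]))
print(out)  # uid 1 -> 7.0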
Example #4
0
def assign_unique_ids(df,
                      uid,
                      id_cols,
                      conflict_cols=None,
                      log=None,
                      unresolved_policy='distinct'):
    """Assigns unique IDs (uid) to dataframe based on id_cols groupings.
       If conflict_cols are specified, conflicts will be resolved
       to determine if id_cols groupings with differing conflict_cols
       information are actually distinct, unresolved conflicts can be
       handled as 'distinct', 'same', or 'manual'.

    Parameters
    ----------
    df : pandas DataFrame
    uid : str
        Name of unique ID column
    id_cols : list
        List of column names used for grouping
    conflict_cols : list
        List of column names used for conflict evaluation
    log : logger
        If given, uid_report will be generated and logged as info
    unresolved_policy : str
        Determines how to handle unresolved conflicts:
        'distinct' = each unresolved in a group is distinct,
        'same' = each unresolved in a group is the same,
        'manual' = send unresolved groups to manual_resolve()

    Returns
    -------
    out_df : pandas DataFrame
    """
    if conflict_cols is None: conflict_cols = []
    dfu = df[id_cols + conflict_cols].drop_duplicates()
    dfu.reset_index(drop=True, inplace=True)

    full_row_count = df.shape[0]
    unique_rows = dfu.shape[0]
    conflict_rows = 0
    conflicts_resolved = 0

    if conflict_cols:
        rd_df = remove_duplicates(dfu, id_cols).reset_index(drop=True)
        rd_df.insert(0, uid, rd_df.index + 1)

        kd_df = keep_duplicates(dfu, id_cols).reset_index(drop=True)
        # Sentinel value so NA id_cols values group together; restored to NaN below
        kd_df[id_cols] = kd_df[id_cols].fillna(value=-9999)

        conflict_rows = kd_df.shape[0]

        next_uid = 1 if rd_df[uid].dropna().empty else rd_df[uid].max() + 1
        rc_df = resolve_conflicts(kd_df, id_cols, conflict_cols, uid, next_uid)

        if log:
            log.info(
                '%d resolved conflicts. %d unresolved conflicts' %
                (rc_df[uid].count(), rc_df[uid].size - rc_df[uid].count()))

        if not rc_df[uid].dropna().empty: next_uid = rc_df[uid].max() + 1
        if rc_df[uid].isnull().sum() > 0:
            if unresolved_policy == 'distinct':
                rc_df.loc[rc_df[uid].isnull(), uid] = \
                    np.arange(next_uid, next_uid + rc_df[uid].isnull().sum())
            elif unresolved_policy == 'same':
                sdf = pd.DataFrame()
                for k, g in rc_df[rc_df[uid].isnull()].groupby(id_cols):
                    g[uid] = next_uid
                    next_uid += 1
                    sdf = sdf.append(g)
                rc_df = rc_df.dropna(subset=[uid]).append(sdf)

            elif unresolved_policy == 'manual':
                mr_df = pd.DataFrame()
                for k, g in rc_df\
                    .loc[rc_df[uid].isnull(), id_cols + conflict_cols]\
                    .groupby(id_cols, as_index=False):
                    g = manual_resolve(g, uid, next_uid)
                    next_uid = g[uid].max() + 1
                    mr_df = mr_df.append(g)
                if log:
                    log.info(
                        'Unresolved conflicts resolved by "%s" into %d ids' %
                        (unresolved_policy, mr_df[uid].nunique()))
                rc_df = rc_df.dropna(subset=[uid]).append(mr_df)

        rc_df[id_cols] = rc_df[id_cols].replace({-9999: np.nan})

        if rc_df.shape[0] == 0:
            conflicts_resolved = 0
        else:
            conflicts_resolved = rc_df[uid].nunique()
        df = df.merge(rd_df.append(rc_df),
                      on=id_cols + conflict_cols,
                      how='left')

    else:
        dfu[uid] = dfu.index + 1
        df = df.merge(dfu, on=id_cols, how='left')

    assert keep_duplicates(df[[uid] + id_cols].drop_duplicates(), uid).empty,\
        'This should not happen. Same uids between id_col groupings.'
    assert df[df[uid].isnull()].shape[0] == 0,\
        'Some unique IDs are null:\n%s' % df[df[uid].isnull()]
    assert max(df[uid]) == df[uid].nunique(),\
        'Unique IDs are not correctly scaled'

    uid_count = max(df[uid])
    uid_report = generate_uid_report(full_row_count, unique_rows,
                                     conflict_rows, conflicts_resolved,
                                     uid_count)
    if log: log.info(uid_report)
    else: print(uid_report)
    return df
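A minimal sketch of the no-conflict path of assign_unique_ids (conflict_cols empty): every distinct id_cols combination receives a sequential ID that is merged back onto the full frame. Column names and data below are hypothetical:

import pandas as pd

toy = pd.DataFrame({'first_name': ['JO', 'JO', 'ANN'],
                    'last_name': ['SMITH', 'SMITH', 'LEE']})
id_cols = ['first_name', 'last_name']

# Sequential UID per distinct id_cols grouping, merged back onto every row.
dfu = toy[id_cols].drop_duplicates().reset_index(drop=True)
dfu['UID'] = dfu.index + 1
out = toy.merge(dfu, on=id_cols, how='left')
print(out)  # both JO SMITH rows share UID 1, ANN LEE gets UID 2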
Example #5
0
    def loop_merge(self,
                   custom_merges=[],
                   verbose=True,
                   one_to_one=True,
                   base_OD_edits=OrderedDict()):
        """Performs iterative pairwise joins to produce dataframe of merges

        Loops over on_lists to iteratively merge ref_um and sup_um, continually
        removing unique ids in sup_um (and ref_um if one_to_one=True) that
        were successfully merged.

        Parameters
        ----------
        custom_merges : list (of lists or dicts)
            List of user-entered custom merge lists
        verbose : bool
            If True, successful merges are printed
        one_to_one : bool
            If True, assumes sup_um is successfully deduplicated and uids are
            dropped from ref_um after successful merges.
            If False, uids are not dropped from ref_um after successful merges.
        base_OD_edits : OrderedDict
            OrderedDict of alterations to base_OD (preserving initial order)

        Returns
        -------
        self
        """
        intersect_cols = list_diff(
            list_intersect(self.ref_um.columns, self.sup_um.columns),
            [self.uid, self.sup_id])
        self.ref_um = self.ref_um[[self.uid] +
                                  intersect_cols].drop_duplicates()
        self.sup_um = self.sup_um[[self.sup_id] +
                                  intersect_cols].drop_duplicates()

        ref_ids = self.ref_um[self.uid].nunique()
        sup_ids = self.sup_um[self.sup_id].nunique()
        multi = self.sup_um[self.sup_id].size > sup_ids

        self.on_lists = self.generate_on_lists(intersect_cols, custom_merges,
                                               base_OD_edits)
        self.id_cols = [self.uid, self.sup_id]
        self.merged_df = pd.DataFrame(columns=self.id_cols + ['matched_on'])
        self.log.info('Beginning loop_merge.')
        for merge_cols in self.on_lists:
            assert len(merge_cols) > 0
            reft = self.ref_um
            supt = self.sup_um
            if isinstance(merge_cols, dict):
                reft = reft.query(merge_cols['query'])
                supt = supt.query(merge_cols['query'])
                merge_cols = merge_cols['cols']
            reft = remove_duplicates(
                reft[[self.uid] + merge_cols].dropna(how='any'), merge_cols,
                True)
            if one_to_one:
                supt = remove_duplicates(
                    supt[[self.sup_id] + merge_cols].dropna(how='any'),
                    merge_cols, True)
            else:
                supt = supt[[self.sup_id] + merge_cols].dropna(how='any')
            mergedt = reft.merge(supt, on=merge_cols,
                                 how='inner')[self.id_cols]
            if multi:
                mergedt = mergedt.drop_duplicates()
            if mergedt.shape[0] > 0:
                if verbose:
                    print('%d Matches on \n %s columns' %
                          (mergedt.shape[0], merge_cols))
                mergedt['matched_on'] = '-'.join(merge_cols)
                self.merged_df = self.merged_df\
                    .append(mergedt[self.id_cols + ['matched_on']])\
                    .reset_index(drop=True)
                if one_to_one:
                    self.ref_um = self.ref_um.loc[
                        ~self.ref_um[self.uid].isin(self.merged_df[self.uid])]
                self.sup_um = self.sup_um.loc[~self.sup_um[self.sup_id].isin(
                    self.merged_df[self.sup_id])]
        self.merged_df.reset_index(drop=True, inplace=True)
        self.log_merge_report(self.merged_df.shape[0], ref_ids, sup_ids)
        self.log.info('\n%s', self.merged_df['matched_on'].value_counts())
        if one_to_one:
            kds = keep_duplicates(self.merged_df, [self.uid])
            assert kds.empty,\
                ('Same UID matched to multiple sup_ids %s'
                 '\n Ref: %s \n Sup: %s'
                 % (kds,
                    self.ref_df[
                        self.ref_df[self.uid].isin(kds[self.uid])
                        ].sort_values(self.uid),
                    self.sup_df[
                        self.sup_df[self.sup_id].isin(kds[self.sup_id])
                        ].sort_values(self.sup_id)))
        kds = keep_duplicates(self.merged_df, [self.sup_id])
        assert kds.empty,\
            ('Same sup_id matched to multiple UIDs %s\nRef: %s\nSup: %s'
             % (kds,
                self.ref_df[
                    self.ref_df[self.uid].isin(kds[self.uid])
                    ].sort_values(self.uid),
                self.sup_df[
                    self.sup_df[self.sup_id].isin(kds[self.sup_id])
                    ].sort_values(self.sup_id)))

        return self
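A toy illustration of the loop_merge idea outside the class: try successively looser join-key lists, record each match, and drop matched IDs from both sides before the next pass. All names and data below are hypothetical:

import pandas as pd

ref = pd.DataFrame({'UID': [1, 2, 3],
                    'last_name': ['SMITH', 'LEE', 'CHAN'],
                    'first_name': ['JO', 'ANN', 'MEI']})
sup = pd.DataFrame({'sup_id': [10, 20, 30],
                    'last_name': ['SMITH', 'LEE', 'CHAN'],
                    'first_name': ['JO', 'A.', 'MEI']})
on_lists = [['last_name', 'first_name'], ['last_name']]  # strict first, then looser

pieces = []
for cols in on_lists:
    m = ref.merge(sup, on=cols, how='inner')[['UID', 'sup_id']]
    if not m.empty:
        m['matched_on'] = '-'.join(cols)
        pieces.append(m)
        ref = ref[~ref['UID'].isin(m['UID'])]        # one_to_one: drop matched refs
        sup = sup[~sup['sup_id'].isin(m['sup_id'])]  # always drop matched sup_ids
merged = pd.concat(pieces, ignore_index=True)
print(merged)  # SMITH and CHAN match on both names, LEE matches on last_name only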
Example #6
0
# Fill missing final_outcome_USE from penalty_code (title-cased)
fill_mask = df_FFN['penalty_code'].notnull() & df_FFN['final_outcome_USE'].isnull()
df_FFN.loc[fill_mask, 'final_outcome_USE'] = \
    df_FFN.loc[fill_mask, 'penalty_code'].str.title()
# Sustained cases whose narrative indicates no penalty or a resignation
df_FFN.loc[df_FFN.final_outcome_USE.isnull() &
           (df_FFN['final_finding'] == 'SUSTAINED') &
           (df_FFN['final_finding_narrative'].str.contains('No Penalty')),
           'final_outcome_USE'] = 'No Action Taken'
df_FFN.loc[df_FFN.final_outcome_USE.isnull() &
           (df_FFN['final_finding'] == 'SUSTAINED') &
           (df_FFN['final_finding_narrative'].str.contains('RESIGNED')),
           'final_outcome_USE'] = 'Resigned'

# Accusations not present in df_FFN: deduplicate, then aggregate per accusation/cr_id
df_NFF = df.loc[~df.accusation_id.isin(df_FFN.accusation_id),
                ['accusation_id', 'cr_id', 'number_of_days', 'final_finding'
                 ]].drop_duplicates()
assert keep_duplicates(
    df_NFF.drop('number_of_days', axis=1).drop_duplicates(),
    ['accusation_id', 'cr_id']).empty
df_NFF = df_NFF\
    .groupby(['accusation_id', 'cr_id', 'final_finding'], as_index=False)\
    .max()
df_NFF['final_finding_USE'] = df_NFF['final_finding']
# Map day counts to "N Day Suspension"; codes found in outcome_dict override
loc_inds = df_NFF['number_of_days'].notnull() & \
    df_NFF['number_of_days'].astype(str).isin(outcome_dict.keys())
df_NFF['final_outcome_USE'] = df_NFF['number_of_days'].map(
    lambda x: np.nan if pd.isnull(x) else str(int(x)) + ' Day Suspension')
df_NFF.loc[loc_inds, 'final_outcome_USE'] = df_NFF.loc[
    loc_inds, 'number_of_days'].astype(str).replace(outcome_dict)

df_FO = df_NFF.append(df_FFN)[[
    'accusation_id', 'cr_id', 'final_finding_USE', 'final_outcome_USE'
]]
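A self-contained sketch of the number_of_days-to-outcome mapping used above; the contents of outcome_dict are invented here purely for illustration (the real mapping is defined elsewhere in the script):

import numpy as np
import pandas as pd

# Hypothetical override dictionary and day counts.
outcome_dict = {'0.0': 'Violation Noted', '600.0': 'Separation'}
days = pd.Series([np.nan, 3.0, 600.0, 0.0])

# Ordinary day counts become "N Day Suspension"; codes in outcome_dict override.
outcome = days.map(lambda x: np.nan if pd.isnull(x) else str(int(x)) + ' Day Suspension')
special = days.astype(str).isin(outcome_dict.keys())
outcome[special] = days[special].astype(str).replace(outcome_dict)
print(outcome)  # NaN, '3 Day Suspension', 'Separation', 'Violation Noted'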
Example #7
0
log.info('Assembling 1967 - 1999 complaints')
c1 = pd.read_csv(cons.input_cmpl1_file)
c1crs = set(c1.cr_id)
c1 = c1[['cr_id', 'incident_date', 'complaint_date', 'closed_date', 'incident_location']]\
    .rename(columns={'incident_location': 'address'})\
    .drop_duplicates()
c1[cons.date_cols] = c1[cons.date_cols].apply(pd.to_datetime)
c1['address'] = c1['address'].fillna('').astype(str)
c1 = keep_duplicates(c1, 'cr_id')\
    .groupby('cr_id', as_index=False)\
    .agg({
        'incident_date': min, 'complaint_date': min, 'closed_date': min,
        'address': lambda x: sorted(x, key=len)[-1]
    })\
    .append(remove_duplicates(c1, 'cr_id'))
assert c1.shape[0] == c1['cr_id'].nunique()
assert set(c1.cr_id) == c1crs
assert c1[c1.cr_id.isnull()].empty
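
The collapse-duplicates pattern above can be illustrated with plain pandas on a made-up frame: rows sharing a cr_id keep the earliest date and the longest address, while unique rows pass through untouched (duplicated(keep=False) stands in for keep_duplicates/remove_duplicates here):

import pandas as pd

c = pd.DataFrame({'cr_id': [100, 100, 200],
                  'complaint_date': pd.to_datetime(['2001-02-03', '2001-01-01', '2002-05-05']),
                  'address': ['63RD ST', '1200 W 63RD ST', 'STATE ST']})
dupes = c[c.duplicated('cr_id', keep=False)]     # rows with a repeated cr_id
singles = c[~c.duplicated('cr_id', keep=False)]  # rows appearing once
collapsed = dupes.groupby('cr_id', as_index=False).agg(
    {'complaint_date': 'min', 'address': lambda x: sorted(x, key=len)[-1]})
out = pd.concat([collapsed, singles], ignore_index=True)
print(out)  # cr_id 100 keeps 2001-01-01 and '1200 W 63RD ST'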

log.info('Assembling 2000 - 2016 complaints')
c2 = pd.read_csv(cons.input_cmpl2_file)
c2crs = set(c2.cr_id)
c2_core = c2\
    .drop(['row_id'] + cons.date_cols, axis=1)\
    .drop_duplicates()

c2_core = keep_duplicates(c2_core, 'cr_id')\
Example #8
0
def combine_histories(uh_list, resignation_df, log,
                      uid='UID', unit='unit',
                      start='unit_start_date',
                      end='unit_end_date',
                      resignation_col='resignation_date'):
    """Combines multiple unit history dataframes into one
       containing unique unit movements for individuals,
       removing non-sensical data and filling missing data

    Parameters
    ----------
    uh_list : list
        List of unit history pandas DataFrame
    resignation_df : pandas DataFrame
        Contains data on resignation dates
    log : logging object
    uid : str
        Name of unique ID column in unit history and resignation date DataFrames
    unit : str
        Name of unit column in unit history DataFrames in uh_list
    start : str
        Name of unit start date column in unit history DataFrames in uh_list
    end : str
        Name of unit end date column in unit history DataFrames in uh_list
    resignation_col : str
        Name of resignation date column in resignation_df

    Returns
    -------
    uh_df : pandas DataFrame
    """
    from assign_unique_ids_functions import aggregate_data

    uh_df = pd.DataFrame()

    for df in uh_list:
        df = df.loc[:, [uid, unit, start, end]]
        df.dropna(subset=[unit, uid, start],
                  how='any', inplace=True)
        log.info('%d rows with non-NA end date and end date '
                 'before/equal to start date',
                 df[(df[end].notnull()) &
                    (df[end] <= df[start])].shape[0])
        df.loc[(df[end].notnull()) &
               (df[end] <= df[start]),
               end] = np.nan
        uh_df = uh_df.append(df)

    uh_df.drop_duplicates(inplace=True)
    uh_df = uh_df.merge(resignation_df,
                        on=uid, how='left')
    indexes = ((uh_df[resignation_col].notnull()) &
               (uh_df[end].isnull()) &
               (uh_df[start] < uh_df[resignation_col]))
    uh_df.loc[indexes, end] = uh_df.loc[indexes, resignation_col]

    uh_df.drop(resignation_col, axis=1, inplace=True)

    uh_rd = remove_duplicates(uh_df, [uid, start, unit])
    uh_kd = keep_duplicates(uh_df, [uid, start, unit])
    uh_kd = aggregate_data(uh_kd,
                           uid=uid, id_cols=[start, unit],
                           max_cols=[end])

    assert uh_rd.shape[0] + uh_kd.shape[0] ==\
        uh_df[[uid, unit, start]].drop_duplicates().shape[0],\
        'Data set lost information after split and aggregation.'

    uh_df = uh_rd.append(uh_kd)
    uh_df.sort_values([uid, start, unit], inplace=True)
    uh_df.reset_index(drop=True, inplace=True)

    return uh_df
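A self-contained sketch of the two cleaning steps inside combine_histories, on made-up rows: end dates that do not fall after the start date are blanked out, and missing end dates are filled from a later resignation date:

import pandas as pd

uh = pd.DataFrame({'UID': [1, 2],
                   'unit': [5, 9],
                   'unit_start_date': pd.to_datetime(['2010-01-01', '2012-06-01']),
                   'unit_end_date': pd.to_datetime(['2009-12-31', pd.NaT])})
resignations = pd.DataFrame({'UID': [2],
                             'resignation_date': pd.to_datetime(['2015-03-01'])})

# Step 1: an end date on or before the start date is treated as missing.
bad_end = uh['unit_end_date'].notnull() & (uh['unit_end_date'] <= uh['unit_start_date'])
uh.loc[bad_end, 'unit_end_date'] = pd.NaT

# Step 2: fill missing end dates with a resignation date that follows the start.
uh = uh.merge(resignations, on='UID', how='left')
fill = (uh['resignation_date'].notnull() & uh['unit_end_date'].isnull()
        & (uh['unit_start_date'] < uh['resignation_date']))
uh.loc[fill, 'unit_end_date'] = uh.loc[fill, 'resignation_date']
print(uh.drop('resignation_date', axis=1))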