Example #1
    def final_profiles(self,
                       aggregate_data_args,
                       output_path='',
                       column_order=None,
                       csv_opts=None,
                       include_IDs=True):
        """Generates unique profiles from reference data and writes to csv

        Parameters
        ----------
        aggregate_data_args : dict
            Dictionary of arguments for aggregate_data
        output_path : str
            If non-empty, write profiles to this path instead of
            storing them on self.profiles
        column_order : list
            List of columns in the desired output order
        csv_opts : dict
            Keyword arguments passed to pandas.DataFrame.to_csv
        include_IDs : bool
            If True, keep ID columns; if False, drop them

        Returns
        -------
        self
        """

        from assign_unique_ids_functions import aggregate_data
        column_order = column_order or []
        csv_opts = csv_opts or {}
        self.generate_foia_dates()
        profiles = aggregate_data(self.ref_df, self.uid, **aggregate_data_args)
        count_df = pd.DataFrame(self.ref_df[[
            col for col in self.ref_df.columns
            if col.endswith("_ID") or col == self.uid
        ]].drop_duplicates()[self.uid].value_counts())
        count_df.columns = ['profile_count']
        count_df[self.uid] = count_df.index
        profiles = profiles.merge(count_df, on=self.uid)
        assert profiles.shape[0] == self.ref_df[self.uid].nunique(),\
            'Profile count ({}) does not match unique {} count ({})'.format(
                profiles.shape[0], self.uid, self.ref_df[self.uid].nunique())

        if include_IDs:
            ID_cols = [col for col in profiles.columns if col.endswith('_ID')]
        else:
            ID_cols = []

        if column_order:
            cols = [col for col in column_order if col in profiles.columns]
        else:
            cols = []

        profiles = profiles[[self.uid] + cols + ID_cols + ['profile_count']]
        self.log.info('Officer profile count: {}'.format(profiles.shape[0]))

        if output_path:
            profiles.to_csv(output_path, **csv_opts)
        else:
            self.profiles = profiles
        return self
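
For context, a minimal usage sketch; the ReferenceData class name and its constructor arguments are assumptions for illustration, not part of the snippet above:

# Hypothetical setup: a ReferenceData-style object exposing final_profiles.
rd = ReferenceData(ref_df, uid='UID', log=log)
rd.final_profiles(
    aggregate_data_args={'id_cols': ['first_name', 'last_name'],
                         'max_cols': ['star']},
    output_path='output/officer-profiles.csv.gz',
    column_order=['first_name', 'last_name', 'star'],
    csv_opts={'index': False, 'compression': 'gzip'})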
Example #2

import copy

import numpy as np
import pandas as pd

import assign_unique_ids_functions


def test_aggregate_data():
    ''' test aggregate_data'''
    input_df = pd.DataFrame({
        'uid': [1, 1, 1, 1, 1, 1, 4, 4, 4, 99, 99],
        'ID': ['A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C'],
        'mode': [2, 2, 2, 3, 3, 3, 1, np.nan, 4, 5, 5],
        'date_of_age_obs': [
            '2014-01-01', '2014-03-01', '2015-01-01', '2015-09-01',
            '2016-10-01', '2015-11-01', '2015-01-01', '2016-01-01', np.nan,
            np.nan, np.nan
        ],
        'age': [20, 20, 21, 22, 23, 22, 56, 57, 90, 35, 30],
        'max': [1, np.nan, 10, 1, 3, 9, 2, 2, -2, np.nan, np.nan],
        'max_names': [
            'One', np.nan, 'Ten', 'One', 'Three', 'Nine', 'Two', 'Two', '-Two',
            np.nan, np.nan
        ]
    })
    orig_input_df = copy.deepcopy(input_df)
    input_args = {
        'uid': 'uid',
        'id_cols': ['ID'],
        'mode_cols': ['mode'],
        'max_cols': ['max'],
        'current_cols': ['age'],
        'time_col': 'date_of_age_obs',
        'merge_cols': ['max_names'],
        'merge_on_cols': ['max']
    }

    output_df = pd.DataFrame(
        {
            'uid': [1, 4, 99],
            'ID': ['A', 'B', 'C'],
            'mode': [2.0, 1.0, 5.0],
            'max': [10, 2, np.nan],
            'current_age': [23, 57, np.nan],
            'max_names': ['Ten', 'Two', np.nan]
        },
        columns=['uid', 'ID', 'mode', 'max', 'current_age', 'max_names'])

    results = assign_unique_ids_functions.aggregate_data(
        input_df, **input_args)
    assert results.equals(output_df)
    assert orig_input_df.equals(input_df)
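
The expected output encodes aggregate_data's per-column behavior as exercised here: id_cols and mode_cols collapse to one value per uid (the tied modes for uid 1 resolve to 2.0), max_cols keep the maximum non-null value (10 for uid 1, NaN for uid 99 where every observation is missing), current_cols are renamed current_* and keep the value observed at the latest time_col date (age 23 on 2016-10-01), and merge_cols carry over the value paired with the selected merge_on_cols value ('Ten' alongside max = 10). The final assert confirms the input frame is not mutated.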
Example #3

    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)

df = assign_unique_ids(df, cons.cr_uid, **cons.cr_auid, log=log)
df = assign_unique_ids(df, cons.ind_uid, **cons.ind_auid, log=log)
udf = union_group(df[[cons.cr_uid, cons.ind_uid]].drop_duplicates(), cons.id,
                  [cons.cr_uid, cons.ind_uid])
df = df.merge(udf, on=[cons.cr_uid, cons.ind_uid])\
    .drop(cons.cr_uid, axis=1)
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df,
                             cons.id,
                             cons.ind_auid['id_cols'],
                             max_cols=cons.ind_auid['conflict_cols'] +
                             ['star'])
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
Example #4
        'output_demo_file':
        'output/settlements_1952-2016_2017-01_profiles.csv.gz',
        'id_cols': [
            "first_name", "last_name", "first_name_NS", "last_name_NS", "star",
            "current_status", "officer_id", "service_years", "service_months",
            "suffix_name", "cost", "rank", "race", "gender"
        ],
        'id':
        'settlements_1952-2016_2017-01_ID'
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)

df = assign_unique_ids(df, cons.id, cons.id_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df, cons.id, cons.id_cols)
profiles_df.to_csv(cons.output_demo_file, **cons.csv_opts)
Example #5

        'output_profiles_file':
        'output/TRR-officers_2004-2016_2016-09_profiles.csv.gz',
        'id_cols': [
            "first_name", "last_name", "first_name_NS", "last_name_NS",
            "middle_initial", 'middle_initial2', "suffix_name",
            "appointed_date", "gender", "race", "current_star"
        ],
        'id':
        'TRR-officers_2004-2016_2016-09_ID'
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)

df = assign_unique_ids(df, cons.id, cons.id_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

agg_df = aggregate_data(df, cons.id, cons.id_cols)
agg_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
Example #6
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

po_df = df[df['first_name'] == 'POLICE']
log.info('{} hidden officers marked as merge = 0'.format(po_df.shape[0]))
log.info(('{} officers with no name marked as merge = 0'
          '').format(df[df['first_name'].isnull()].shape[0]))

df.loc[(df['first_name'].notnull()) & (df['first_name'] != 'POLICE'),
       'merge'] = 1
df['merge'] = df['merge'].fillna(0)

df = assign_unique_ids(df,
                       cons.id,
                       cons.id_cols + ['merge'],
                       conflict_cols=cons.conflict_cols,
                       log=log)

df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df,
                             cons.id,
                             cons.id_cols + ['merge'],
                             max_cols=cons.max_cols + cons.conflict_cols,
                             merge_cols=cons.merge_cols,
                             merge_on_cols=cons.merge_on_cols)

profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
Example #7
        'output_profiles_file':
        'output/complaints-investigators_2000-2016_2016-11_profiles.csv.gz',
        'id_cols': [
            'first_name', 'last_name', 'appointed_date', 'first_name_NS',
            'last_name_NS', 'middle_initial'
        ],
        'max_cols': ['current_unit', 'current_star', 'current_rank'],
        'id':
        'complaints-investigators_2000-2016_2016-11_ID'
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)

df = assign_unique_ids(df, cons.id, cons.id_cols, log=log)
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df, cons.id, cons.id_cols, max_cols=cons.max_cols)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
Example #8

        'id': 'subject_ID'
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


cons, log = get_setup()

df = pd.read_csv(cons.input_file)
df = union_group(df, cons.id_cols[0], cons.group_cols)
log.info('%d group_ids' % df[cons.id_cols[0]].nunique())
df = assign_unique_ids(df,
                       cons.id,
                       cons.id_cols,
                       cons.conflict_cols,
                       log=log,
                       unresolved_policy='distinct')
adf = aggregate_data(df,
                     cons.id,
                     id_cols=cons.id_cols,
                     max_cols=cons.max_cols + cons.conflict_cols)
df.to_csv(cons.output_file, **cons.csv_opts)
adf.to_csv(cons.output_profiles_file, **cons.csv_opts)
Example #9

cons, log = get_setup()

df = pd.read_csv(cons.input_file)

df["Specify"] = 0
res1_units = [5, 602]
df.loc[(df["first_name"] == "ROBERT") & (df["last_name"] == "SMITH") &
       (df["middle_initial"] == "E") & (df["birth_year"] == 1947) &
       (df["appointed_date"] == "1971-02-22") & (df["unit"].isin(res1_units)),
       "Specify"] = 1
log.info(("Robert E Smith 1947 1971-02-22 in units {}"
          " specified as singular individual.").format(res1_units))

df = assign_unique_ids(df,
                       cons.id,
                       cons.id_cols + ["Specify"],
                       cons.conflict_cols,
                       log=log)
del df["Specify"]
log.info(("Specify column used to manually distinguish individuals"
          " created for AUID then dropped before aggregation"))
df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(df,
                             cons.id,
                             cons.id_cols,
                             max_cols=cons.conflict_cols + cons.max_cols,
                             current_cols=cons.current_cols,
                             time_col=cons.time_col)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
Example #10
def combine_histories(uh_list, resignation_df, log,
                      uid='UID', unit='unit',
                      start='unit_start_date',
                      end='unit_end_date',
                      resignation_col='resignation_date'):
    """Combines multiple unit history dataframes into one
       containing unique unit movements for individuals,
       removing nonsensical data and filling in missing data

    Parameters
    ----------
    uh_list : list
        List of unit history pandas DataFrames
    resignation_df : pandas DataFrame
        Contains data on resignation dates
    log : logging object
    uid : str
        Name of unique ID column in unit history and resignation date DataFrames
    unit : str
        Name of unit column in unit history DataFrames in uh_list
    start : str
        Name of unit start date column in unit history DataFrames in uh_list
    end : str
        Name of unit end date column in unit history DataFrames in uh_list
    resignation_col : str
        Name of resignation date column in resignation_df

    Returns
    -------
    uh_df : pandas DataFrame
    """
    from assign_unique_ids_functions import aggregate_data

    uh_frames = []

    for df in uh_list:
        df = df.loc[:, [uid, unit, start, end]].copy()
        df.dropna(subset=[unit, uid, start],
                  how='any', inplace=True)
        log.info(('%d rows with non-NA end date and end date '
                  'before/equal to start date'
                  ''), df[(df[end].notnull()) &
                          (df[end] <= df[start])].shape[0])
        df.loc[(df[end].notnull()) &
               (df[end] <= df[start]),
               end] = np.nan
        uh_frames.append(df)

    uh_df = pd.concat(uh_frames)
    uh_df.drop_duplicates(inplace=True)
    uh_df = uh_df.merge(resignation_df,
                        on=uid, how='left')
    indexes = ((uh_df[resignation_col].notnull()) &
               (uh_df[end].isnull()) &
               (uh_df[start] < uh_df[resignation_col]))
    uh_df.loc[indexes, end] = uh_df.loc[indexes, resignation_col]

    uh_df.drop(resignation_col, axis=1, inplace=True)

    uh_rd = remove_duplicates(uh_df, [uid, start, unit])
    uh_kd = keep_duplicates(uh_df, [uid, start, unit])
    uh_kd = aggregate_data(uh_kd,
                           uid=uid, id_cols=[start, unit],
                           max_cols=[end])

    assert uh_rd.shape[0] + uh_kd.shape[0] ==\
        uh_df[[uid, unit, start]].drop_duplicates().shape[0],\
        'Data set lost information after split and aggregation.'

    uh_df = pd.concat([uh_rd, uh_kd])
    uh_df.sort_values([uid, start, unit], inplace=True)
    uh_df.reset_index(drop=True, inplace=True)

    return uh_df
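
A minimal usage sketch; the file paths and input frames below are hypothetical, assumed only for illustration:

# uh1/uh2 must carry UID, unit, unit_start_date, unit_end_date columns;
# resignations must carry UID and resignation_date.
uh1 = pd.read_csv('input/unit-history-1.csv.gz')
uh2 = pd.read_csv('input/unit-history-2.csv.gz')
resignations = pd.read_csv('input/resignations.csv.gz')
uh_df = combine_histories([uh1, uh2], resignations, log)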