def test_assign_unique_ids():
    '''test assign_unique_ids with its default unresolved policy

       Calls assign_unique_ids with uid='ID', id_cols=['A'] and
       conflict_cols=['B', 'C'], then checks that the returned frame
       equals the expected one and that the input frame is unchanged.
       does not test report generation
    '''
    input_df = pd.DataFrame({
        'A': [1, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5],
        'B': [2, np.nan, 2, 3, 4, 4, 2, np.nan, 3, 4, 4],
        'C': [2, np.nan, np.nan, 3, 1, 1, np.nan, 5, np.nan, np.nan, np.nan]
    })
    # Snapshot taken before the call so mutation of the input is detectable.
    orig_input_df = copy.deepcopy(input_df)
    input_args = {
        'uid': 'ID',
        'id_cols': ['A'],
        'conflict_cols': ['B', 'C'],
        'log': False
    }

    # 'ID' is listed last so the expected column order is A, B, C, ID, as
    # in the other tests in this file. Listing it first only matched on
    # pandas versions that sort dict keys; with insertion-ordered columns
    # DataFrame.equals would fail on column order alone.
    output_df = pd.DataFrame({
        'A': [1, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5],
        'B': [2, np.nan, 2, 3, 4, 4, 2, np.nan, 3, 4, 4],
        'C': [2, np.nan, np.nan, 3, 1, 1, np.nan, 5, np.nan, np.nan, np.nan],
        'ID': [3.0, 3.0, 3.0, 1.0, 2.0, 2.0, 4.0, 4.0, 5.0, 6.0, 6.0]
    })

    results = assign_unique_ids_functions.assign_unique_ids(
        input_df, **input_args)
    assert results.equals(output_df)
    assert orig_input_df.equals(input_df)
def test_assign_unique_ids_with_empty_kd_df():
    '''assign_unique_ids with an empty kd_df and conflict_cols

       Uses unresolved_policy='distinct' with a single conflict column
       and verifies both the returned frame and that the input frame is
       left untouched.
       does not test report generation
    '''
    columns = {
        'A': [1, 1, 2, 3],
        'B': [2, 2, np.nan, 3],
        'C': [1, 2, 3, 4]
    }
    input_df = pd.DataFrame(columns)
    # Deep copy kept aside to detect in-place modification of the input.
    untouched_df = input_df.copy(deep=True)

    # Expected result: the input columns followed by the assigned ids.
    expected_df = pd.DataFrame(dict(columns, ID=[1, 1, 2, 3]))

    actual_df = assign_unique_ids_functions.assign_unique_ids(
        input_df,
        uid='ID',
        id_cols=['A'],
        conflict_cols=['B'],
        log=False,
        unresolved_policy='distinct')

    assert actual_df.equals(expected_df)
    assert untouched_df.equals(input_df)
def test_assign_unique_ids_many_nans():
    '''assign_unique_ids with unresolved_policy = 'distinct'
       on data containing NaNs in a conflict column

       Verifies the returned frame and that the input frame is left
       untouched.
       does not test report generation
    '''
    columns = {
        'A': [1, 1, 1, 1, 2, 2, 2, 2],
        'B': [1, 2, 2, 2, 4, 4, 4, 3],
        'C': [2, 2, 3, np.nan, 1, 2, np.nan, 1]
    }
    input_df = pd.DataFrame(columns)
    # Deep copy kept aside to detect in-place modification of the input.
    untouched_df = input_df.copy(deep=True)

    # Expected result: the input columns followed by the assigned ids.
    expected_ids = [1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 2.0]
    expected_df = pd.DataFrame(dict(columns, ID=expected_ids))

    actual_df = assign_unique_ids_functions.assign_unique_ids(
        input_df,
        uid='ID',
        id_cols=['A'],
        conflict_cols=['B', 'C'],
        log=False,
        unresolved_policy='distinct')

    assert actual_df.equals(expected_df)
    assert untouched_df.equals(input_df)
def test_assign_unique_ids_unresolved_same():
    '''assign_unique_ids with unresolved_policy = 'same'

       Verifies the returned frame and that the input frame is left
       untouched.
       does not test report generation
    '''
    columns = {
        'A': [1, 1, 1, 1, 2, 2, 2, 2, 3],
        'B': [2, np.nan, 3, 2, 3, 4, 4, 4, 3],
        'C': [2, np.nan, np.nan, np.nan, 1, 1, np.nan, 2, 3]
    }
    input_df = pd.DataFrame(columns)
    # Deep copy kept aside to detect in-place modification of the input.
    untouched_df = input_df.copy(deep=True)

    # Expected result: the input columns followed by the assigned ids.
    expected_ids = [3.0, 3.0, 3.0, 3.0, 2.0, 4.0, 4.0, 4.0, 1.0]
    expected_df = pd.DataFrame(dict(columns, ID=expected_ids))

    actual_df = assign_unique_ids_functions.assign_unique_ids(
        input_df,
        uid='ID',
        id_cols=['A'],
        conflict_cols=['B', 'C'],
        log=False,
        unresolved_policy='same')

    assert actual_df.equals(expected_df)
    assert untouched_df.equals(input_df)
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


# Script body: assign two intermediate ids, combine them into the final id
# with union_group, then write the row-level and profile-level outputs.
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# First pass keyed by the cr_auid settings, second by the ind_auid settings.
df = assign_unique_ids(df, cons.cr_uid, **cons.cr_auid, log=log)
df = assign_unique_ids(df, cons.ind_uid, **cons.ind_auid, log=log)

# Distinct (cr_uid, ind_uid) pairs are handed to union_group to produce
# cons.id; the result is merged back onto the rows by the same pair.
uid_pair = (cons.cr_uid, cons.ind_uid)
unique_pairs = df[list(uid_pair)].drop_duplicates()
udf = union_group(unique_pairs, cons.id, list(uid_pair))
df = df.merge(udf, on=list(uid_pair))
df = df.drop(cons.cr_uid, axis=1)
df.to_csv(cons.output_file, **cons.csv_opts)

profile_max_cols = cons.ind_auid['conflict_cols'] + ['star']
profiles_df = aggregate_data(
    df, cons.id, cons.ind_auid['id_cols'], max_cols=profile_max_cols)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
示例#6
0
        'output_demo_file':
        'output/settlements_1952-2016_2017-01_profiles.csv.gz',
        'id_cols': [
            "first_name", "last_name", "first_name_NS", "last_name_NS", "star",
            "current_status", "officer_id", "service_years", "service_months",
            "suffix_name", "cost", "rank", "race", "gender"
        ],
        'id':
        'settlements_1952-2016_2017-01_ID'
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


# Script body: read the input, assign ids keyed on cons.id_cols, and write
# both the row-level file and the aggregated profiles file.
cons, log = get_setup()

df = assign_unique_ids(
    pd.read_csv(cons.input_file),
    cons.id,
    cons.id_cols,
    log=log,
)
df.to_csv(cons.output_file, **cons.csv_opts)

# Profiles are aggregated from the id-assigned rows using the same id_cols.
profiles_df = aggregate_data(df, cons.id, cons.id_cols)
profiles_df.to_csv(cons.output_demo_file, **cons.csv_opts)
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


# Script body: assign sub-ids separately within each year, offset them by
# year so they stay distinct across years, and stitch the years together.
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Per-year frames are collected and concatenated once after the loop.
# DataFrame.append was deprecated and then removed in pandas 2.0, and
# appending inside the loop re-copies the accumulated rows every iteration.
yearly_dfs = []

for year in df['year'].unique():
    log.info('Assigning unique sub-ids for year: %d', year)
    sub_df = df[df['year'] == year]
    sub_df = assign_unique_ids(sub_df, cons.sub_id,
                               cons.sub_id_cols,
                               conflict_cols=cons.sub_conflict_cols,
                               log=log)
    # Shift ids by year * 100000; presumably this keeps ids from different
    # years from colliding (assumes < 100000 sub-ids per year - TODO confirm).
    sub_df[cons.sub_id] = sub_df[cons.sub_id] + year * 100000
    yearly_dfs.append(sub_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# the input has no years at all.
full_df = pd.concat(yearly_dfs) if yearly_dfs else pd.DataFrame()

# The message is a plain string: the previous print(...) call evaluated to
# None, so the AssertionError itself carried no explanation.
assert full_df.shape[0] == df.shape[0],\
    'Remerged data does not match input dataset'

df = full_df

log.info("Beginning self-merge process")

for year in range(2002, 2018):
    dfy = df[df['year'] == year].copy()
    yid = cons.year_id.replace('year', str(year))
    dfy.rename(columns={cons.year_id: yid},
示例#8
0
# Script body: flag which rows get merge = 1, assign ids using that flag as
# an extra id column, then write the row-level and profile-level outputs.
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Rows named 'POLICE' and rows without a first name are counted for the
# log; both groups end up with merge = 0 below.
is_police = df['first_name'] == 'POLICE'
po_df = df[is_police]
log.info('{} hidden officers marked as merge = 0'.format(po_df.shape[0]))
is_nameless = df['first_name'].isnull()
nameless_df = df[is_nameless]
log.info('{} officer with no name marked as merge = 0'.format(
    nameless_df.shape[0]))

# merge = 1 for rows with a real (non-null, non-'POLICE') first name;
# every remaining row is filled with 0.
has_real_name = (df['first_name'].notnull()) & (df['first_name'] != 'POLICE')
df.loc[has_real_name, 'merge'] = 1
df['merge'] = df['merge'].fillna(0)

df = assign_unique_ids(
    df,
    cons.id,
    cons.id_cols + ['merge'],
    conflict_cols=cons.conflict_cols,
    log=log,
)

df.to_csv(cons.output_file, **cons.csv_opts)

profiles_df = aggregate_data(
    df,
    cons.id,
    cons.id_cols + ['merge'],
    max_cols=cons.max_cols + cons.conflict_cols,
    merge_cols=cons.merge_cols,
    merge_on_cols=cons.merge_on_cols,
)

profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)
        'id': 'subject_ID'
    }

    assert (args['input_file'].startswith('input/') and
            args['input_file'].endswith('.csv.gz')),\
        "input_file is malformed: {}".format(args['input_file'])
    assert (args['output_file'].startswith('output/') and
            args['output_file'].endswith('.csv.gz')),\
        "output_file is malformed: {}".format(args['output_file'])

    return setup.do_setup(script_path, args)


# Script body: build a group id with union_group, assign ids with the
# 'distinct' unresolved policy, aggregate, and write both outputs.
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# The first entry of cons.id_cols is the column union_group writes into.
group_col = cons.id_cols[0]
df = union_group(df, group_col, cons.group_cols)
group_count = df[cons.id_cols[0]].nunique()
log.info('%d group_ids' % group_count)

df = assign_unique_ids(
    df,
    cons.id,
    cons.id_cols,
    cons.conflict_cols,
    log=log,
    unresolved_policy='distinct',
)

# Aggregation runs before either file is written, as in the original order.
profile_max_cols = cons.max_cols + cons.conflict_cols
adf = aggregate_data(
    df, cons.id, id_cols=cons.id_cols, max_cols=profile_max_cols)

df.to_csv(cons.output_file, **cons.csv_opts)
adf.to_csv(cons.output_profiles_file, **cons.csv_opts)
# Script body: mark one hand-picked set of rows with a temporary "Specify"
# flag, use it as an extra id column, drop it, then aggregate and write.
cons, log = get_setup()

df = pd.read_csv(cons.input_file)

# Every row starts at 0; only rows matching all conditions below get 1.
df["Specify"] = 0
res1_units = [5, 602]
is_target_name = ((df["first_name"] == "ROBERT") &
                  (df["last_name"] == "SMITH") &
                  (df["middle_initial"] == "E"))
is_target_dates = ((df["birth_year"] == 1947) &
                   (df["appointed_date"] == "1971-02-22"))
in_target_units = df["unit"].isin(res1_units)
df.loc[is_target_name & is_target_dates & in_target_units, "Specify"] = 1

specify_msg = ("Robert E Smith 1947 1971-02-22 in units {}"
               " specified as singular individual.")
log.info(specify_msg.format(res1_units))

df = assign_unique_ids(
    df,
    cons.id,
    cons.id_cols + ["Specify"],
    cons.conflict_cols,
    log=log,
)

# The flag is removed before writing and aggregating the data.
del df["Specify"]
log.info("Specify column used to manually distinguish individuals"
         " created for AUID then dropped before aggregation")
df.to_csv(cons.output_file, **cons.csv_opts)

profile_max_cols = cons.conflict_cols + cons.max_cols
profiles_df = aggregate_data(
    df,
    cons.id,
    cons.id_cols,
    max_cols=profile_max_cols,
    current_cols=cons.current_cols,
    time_col=cons.time_col,
)
profiles_df.to_csv(cons.output_profiles_file, **cons.csv_opts)