Пример #1
0
def test_resolve_units_case9():
    '''resolve_units on the case 9 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case9.csv.gz")

    # Expected output, one entry per row; the last row ends at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    units = [128.0, 23.0, 10.0, 2.0, 2.0, 24.0, 9.0, 4.0, 19.0, 18.0, 12.0]
    start_dates = pd.to_datetime([
        '1932-09-01', '1961-01-01', '1961-12-16', '1962-07-01',
        '1964-01-01', '1965-11-15', '1965-11-19', '1967-12-05',
        '1970-04-24', '1972-01-01', '1977-12-08'
    ])
    end_dates = pd.to_datetime([
        '1960-12-31', '1961-12-15', '1962-06-30', '1963-12-31',
        '1965-11-14', '1965-11-18', '1967-12-04', '1970-04-23',
        '1971-12-31', '1977-12-07', TODAY
    ])
    codes = [1, 1, 1, 1, -4, 0, 1, 1, 1, 1, 0]
    expected = pd.DataFrame(
        {
            'unit': units,
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': codes,
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #2
0
def test_resolve_units_case11():
    '''resolve_units on the case 11 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case11.csv.gz")

    # Expected output: two rows, the second one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['1954-10-01', '1982-03-04'])
    end_dates = pd.to_datetime(['1981-08-09', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [54.0, 50.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 0],
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #3
0
def test_resolve_units_case10():
    '''resolve_units on the case 10 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case10.csv.gz")

    # Expected output: a single row that ends at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['1996-12-15'])
    end_dates = pd.to_datetime([TODAY])
    expected = pd.DataFrame(
        {
            'unit': [620.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [0],
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #4
0
def test_resolve_units_case5():
    '''resolve_units on the case 5 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case5.csv.gz")

    # Expected output: two rows, the second one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['1986-11-13', '1995-08-17'])
    end_dates = pd.to_datetime(['1995-08-16', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [6.0, 5.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 0],
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #5
0
def test_resolve_units_case1():
    '''resolve_units on the case 1 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case1.csv.gz")

    # Expected output: three rows, the last one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['2006-05-25', '2016-02-07', '2016-02-28'])
    end_dates = pd.to_datetime(['2016-02-06', '2016-02-27', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [9.0, 3.0, 8.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 0],
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #6
0
def test_resolve_units_case3():
    '''resolve_units on the case 3 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case3.csv.gz")

    # Expected output: four rows, the last one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(
        ['1990-08-27', '1992-02-06', '2000-09-01', '2016-06-05'])
    end_dates = pd.to_datetime(
        ['1992-02-05', '2000-08-31', '2016-06-04', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [44.0, 6.0, 620.0, 2.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 0],
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #7
0
def test_resolve_units_case8():
    '''resolve_units on the case 8 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case8.csv.gz")

    # Expected output: eight rows, the last one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    units = [22.0, 11.0, 15.0, 152.0, 144.0, 50.0, 19.0, 23.0]
    start_dates = pd.to_datetime([
        '1963-12-02', '1964-03-01', '1966-10-16', '1970-03-05',
        '1971-01-01', '1971-04-16', '1974-12-02', '1977-08-23'
    ])
    end_dates = pd.to_datetime([
        '1964-02-29', '1966-10-15', '1970-03-04', '1970-12-31',
        '1971-04-15', '1974-12-01', '1977-08-22', TODAY
    ])
    codes = [1, 1, 1, 1, 1, 1, 1, 0]
    expected = pd.DataFrame(
        {
            'unit': units,
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': codes,
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #8
0
def test_resolve_units_case7():
    '''resolve_units on the case 7 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case7.csv.gz")

    # Expected output: eight rows, the last one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    units = [44.0, 19.0, 85.0, 189.0, 17.0, 84.0, 543.0, 11.0]
    start_dates = pd.to_datetime([
        '1966-08-15', '1966-11-20', '1967-02-02', '1970-07-23',
        '1971-08-16', '1979-10-16', '1984-05-24', '1985-01-31'
    ])
    end_dates = pd.to_datetime([
        '1966-11-19', '1967-02-01', '1970-07-22', '1971-08-15',
        '1979-10-15', '1984-05-23', '1985-01-30', TODAY
    ])
    codes = [1, 1, 1, 1, -4, 1, 1, 0]
    expected = pd.DataFrame(
        {
            'unit': units,
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': codes,
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #9
0
def test_resolve_units_case4():
    '''resolve_units on the case 4 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case4.csv.gz")

    # Expected output: seven rows, the last one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    units = [15.0, 1.0, 153.0, 23.0, 45.0, 189.0, 191.0]
    start_dates = pd.to_datetime([
        '2000-08-14', '2001-07-19', '2007-03-01', '2007-11-08',
        '2009-08-13', '2011-03-31', '2013-06-23'
    ])
    end_dates = pd.to_datetime([
        '2001-07-18', '2007-02-28', '2007-11-07', '2009-08-12',
        '2011-03-30', '2013-06-22', TODAY
    ])
    codes = [1, 1, 1, 1, 1, 1, 0]
    expected = pd.DataFrame(
        {
            'unit': units,
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': codes,
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #10
0
def test_resolve_units_case12():
    '''resolve_units on the case 12 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case12.csv.gz")

    # Expected output: four rows sharing ID 2; unlike the other cases the
    # expected frame carries an 'ID' column as well.
    column_order = ['ID', 'unit', 'unit_start_date', 'unit_end_date', 'CODE']
    ids = [2] * 4
    start_dates = pd.to_datetime(
        ['2006-06-26', '2007-12-06', '2011-01-06', '2011-09-15'])
    end_dates = pd.to_datetime(
        ['2007-12-05', '2011-01-05', '2011-09-14', TODAY])
    expected = pd.DataFrame(
        {
            'ID': ids,
            'unit': [44.0, 10.0, 193.0, 10.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 0],
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
Пример #11
0
def test_resolve_units_case6():
    '''resolve_units on the case 6 fixture must match the expected frame.'''
    history = pd.read_csv("unit_history_files/case6.csv.gz")

    # Expected output: nine rows, the last one ending at TODAY.
    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    units = [22.0, 6.0, 189.0, 92.0, 7.0, 15.0, 4.0, 15.0, 2.0]
    start_dates = pd.to_datetime([
        '1981-01-19', '1982-02-04', '1991-04-25', '1991-05-23',
        '1992-09-17', '1998-12-10', '1999-06-16', '2000-01-06',
        '2007-03-29'
    ])
    end_dates = pd.to_datetime([
        '1982-02-03', '1991-04-24', '1991-05-22', '1992-09-16',
        '1998-12-09', '1999-06-15', '2000-01-05', '2007-03-28', TODAY
    ])
    codes = [1, 1, 1, 1, 1, 1, 1, 1, 0]
    expected = pd.DataFrame(
        {
            'unit': units,
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': codes,
        },
        columns=column_order)

    actual = unit_history_functions.resolve_units(history)
    assert actual.equals(expected)
# Build one de-duplicated unit-history table from every configured input
# file, resolve each UID's unit timeline, and write the result to disk.
uh_list = [pd.read_csv(f) for f in cons.unit_history_files]

# Frames are collected and concatenated once after the loop:
# DataFrame.append no longer exists in pandas 2.0 and, where it does exist,
# copies the whole accumulated table on every call.
tagged_frames = []
for df in uh_list:
    # Keep only the columns used downstream and drop rows missing a unit,
    # UID or start date. The explicit copy makes the column assignments
    # below write to an independent frame rather than a slice of the input.
    df = (df.loc[:, [UID, UNIT, START, END]]
          .dropna(subset=[UNIT, UID, START], how='any')
          .copy())
    df[START] = pd.to_datetime(df[START])
    df[END] = pd.to_datetime(df[END])
    # end date present but not after the start date -> 1
    case1 = df[END].notnull() & (df[END] <= df[START])
    # no end date -> 2
    case2 = df[END].isnull()
    # end date present and after the start date -> 3
    case3 = df[END].notnull() & (df[END] > df[START])
    # Sanity checks: no row matches all three cases, and the per-case counts
    # add up to the number of rows.
    assert (case1 & case2 & case3).sum() == 0
    assert case1.sum() + case2.sum() + case3.sum() == df.shape[0]
    df.loc[case1, 'value'] = 1
    df.loc[case2, 'value'] = 2
    df.loc[case3, 'value'] = 3
    tagged_frames.append(df)
uh_df = pd.concat(tagged_frames)
uh_df = uh_df.drop_duplicates()
# Remember how many distinct UIDs go in so the output can be checked below.
nUIDs = uh_df[UID].nunique()

log.info("Starting resolve_units")
# Resolve each UID's rows independently, then stack the per-UID results.
resolved = pd.concat([
    resolve_units(g, START, END, UNIT)
    for _uid, g in uh_df[[UID, UNIT, START, END]].groupby(UID, as_index=False)
])
log.info("Done resolve_units")
assert resolved[UID].nunique() == nUIDs, 'Lost UIDs during resolve_units'
resolved.to_csv(cons.output_file, **cons.csv_opts)