def test_resolve_units_case9():
    """Check resolve_units against the expected frame for fixture case 9."""
    raw_history = pd.read_csv("unit_history_files/case9.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime([
        '1932-09-01', '1961-01-01', '1961-12-16', '1962-07-01',
        '1964-01-01', '1965-11-15', '1965-11-19', '1967-12-05',
        '1970-04-24', '1972-01-01', '1977-12-08',
    ])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime([
        '1960-12-31', '1961-12-15', '1962-06-30', '1963-12-31',
        '1965-11-14', '1965-11-18', '1967-12-04', '1970-04-23',
        '1971-12-31', '1977-12-07', TODAY,
    ])
    expected = pd.DataFrame(
        {
            'unit': [128.0, 23.0, 10.0, 2.0, 2.0, 24.0,
                     9.0, 4.0, 19.0, 18.0, 12.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 1, -4, 0, 1, 1, 1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case11():
    """Check resolve_units against the expected frame for fixture case 11."""
    raw_history = pd.read_csv("unit_history_files/case11.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['1954-10-01', '1982-03-04'])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime(['1981-08-09', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [54.0, 50.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case10():
    """Check resolve_units against the expected frame for fixture case 10."""
    raw_history = pd.read_csv("unit_history_files/case10.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['1996-12-15'])
    # The single expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime([TODAY])
    expected = pd.DataFrame(
        {
            'unit': [620.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case5():
    """Check resolve_units against the expected frame for fixture case 5."""
    raw_history = pd.read_csv("unit_history_files/case5.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['1986-11-13', '1995-08-17'])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime(['1995-08-16', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [6.0, 5.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case1():
    """Check resolve_units against the expected frame for fixture case 1."""
    raw_history = pd.read_csv("unit_history_files/case1.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(['2006-05-25', '2016-02-07', '2016-02-28'])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime(['2016-02-06', '2016-02-27', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [9.0, 3.0, 8.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case3():
    """Check resolve_units against the expected frame for fixture case 3."""
    raw_history = pd.read_csv("unit_history_files/case3.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(
        ['1990-08-27', '1992-02-06', '2000-09-01', '2016-06-05'])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime(
        ['1992-02-05', '2000-08-31', '2016-06-04', TODAY])
    expected = pd.DataFrame(
        {
            'unit': [44.0, 6.0, 620.0, 2.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case8():
    """Check resolve_units against the expected frame for fixture case 8."""
    raw_history = pd.read_csv("unit_history_files/case8.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime([
        '1963-12-02', '1964-03-01', '1966-10-16', '1970-03-05',
        '1971-01-01', '1971-04-16', '1974-12-02', '1977-08-23',
    ])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime([
        '1964-02-29', '1966-10-15', '1970-03-04', '1970-12-31',
        '1971-04-15', '1974-12-01', '1977-08-22', TODAY,
    ])
    expected = pd.DataFrame(
        {
            'unit': [22.0, 11.0, 15.0, 152.0, 144.0, 50.0, 19.0, 23.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 1, 1, 1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case7():
    """Check resolve_units against the expected frame for fixture case 7."""
    raw_history = pd.read_csv("unit_history_files/case7.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime([
        '1966-08-15', '1966-11-20', '1967-02-02', '1970-07-23',
        '1971-08-16', '1979-10-16', '1984-05-24', '1985-01-31',
    ])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime([
        '1966-11-19', '1967-02-01', '1970-07-22', '1971-08-15',
        '1979-10-15', '1984-05-23', '1985-01-30', TODAY,
    ])
    expected = pd.DataFrame(
        {
            'unit': [44.0, 19.0, 85.0, 189.0, 17.0, 84.0, 543.0, 11.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 1, -4, 1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case4():
    """Check resolve_units against the expected frame for fixture case 4."""
    raw_history = pd.read_csv("unit_history_files/case4.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime([
        '2000-08-14', '2001-07-19', '2007-03-01', '2007-11-08',
        '2009-08-13', '2011-03-31', '2013-06-23',
    ])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime([
        '2001-07-18', '2007-02-28', '2007-11-07', '2009-08-12',
        '2011-03-30', '2013-06-22', TODAY,
    ])
    expected = pd.DataFrame(
        {
            'unit': [15.0, 1.0, 153.0, 23.0, 45.0, 189.0, 191.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 1, 1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case12():
    """Check resolve_units against the expected frame for fixture case 12.

    Unlike the other cases visible here, the expected frame also carries an
    'ID' column as its first column.
    """
    raw_history = pd.read_csv("unit_history_files/case12.csv.gz")

    column_order = ['ID', 'unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime(
        ['2006-06-26', '2007-12-06', '2011-01-06', '2011-09-15'])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime(
        ['2007-12-05', '2011-01-05', '2011-09-14', TODAY])
    expected = pd.DataFrame(
        {
            'ID': [2, 2, 2, 2],
            'unit': [44.0, 10.0, 193.0, 10.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
def test_resolve_units_case6():
    """Check resolve_units against the expected frame for fixture case 6."""
    raw_history = pd.read_csv("unit_history_files/case6.csv.gz")

    column_order = ['unit', 'unit_start_date', 'unit_end_date', 'CODE']
    start_dates = pd.to_datetime([
        '1981-01-19', '1982-02-04', '1991-04-25', '1991-05-23',
        '1992-09-17', '1998-12-10', '1999-06-16', '2000-01-06',
        '2007-03-29',
    ])
    # The last expected row ends at the module-level TODAY value.
    end_dates = pd.to_datetime([
        '1982-02-03', '1991-04-24', '1991-05-22', '1992-09-16',
        '1998-12-09', '1999-06-15', '2000-01-05', '2007-03-28',
        TODAY,
    ])
    expected = pd.DataFrame(
        {
            'unit': [22.0, 6.0, 189.0, 92.0, 7.0, 15.0, 4.0, 15.0, 2.0],
            'unit_start_date': start_dates,
            'unit_end_date': end_dates,
            'CODE': [1, 1, 1, 1, 1, 1, 1, 1, 0],
        },
        columns=column_order,
    )

    actual = unit_history_functions.resolve_units(raw_history)
    assert actual.equals(expected)
# Load every configured unit-history file, clean each one, combine them and
# run resolve_units once per UID group before writing the result to disk.
uh_list = [pd.read_csv(f) for f in cons.unit_history_files]

# Cleaned frames are collected and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
cleaned_frames = []
for df in uh_list:
    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of the loaded one.
    df = df.loc[:, [UID, UNIT, START, END]].copy()
    df = df.dropna(subset=[UNIT, UID, START], how='any')
    df[START] = pd.to_datetime(df[START])
    df[END] = pd.to_datetime(df[END])

    # mark erroneous with negative time, fill with NA -> 1
    case1 = df[END].notnull() & (df[END] <= df[START])
    # non-erroneous with positive time but no end date -> 2
    case2 = df[END].isnull()
    # non-erroneous with positive time and end date -> 3
    case3 = df[END].notnull() & (df[END] > df[START])

    # Sanity checks: no row is in all three cases at once, and the case
    # counts add up to the number of rows.
    assert sum(case1 & case2 & case3) == 0
    assert sum(case1) + sum(case2) + sum(case3) == df.shape[0]

    df.loc[case1, 'value'] = 1
    df.loc[case2, 'value'] = 2
    df.loc[case3, 'value'] = 3
    cleaned_frames.append(df)

# pd.concat raises on an empty list; fall back to an empty frame so the
# no-input behaviour matches the original accumulate-from-empty approach.
uh_df = pd.concat(cleaned_frames) if cleaned_frames else pd.DataFrame()
uh_df = uh_df.drop_duplicates()

# Remember how many distinct UIDs go in so we can verify none are lost.
nUIDs = uh_df[UID].nunique()

log.info("Starting resolve_units")
resolved = pd.concat([
    resolve_units(group, START, END, UNIT)
    for _, group in uh_df[[UID, UNIT, START, END]].groupby(UID, as_index=False)
])
log.info("Done resolve_units")

assert resolved[UID].nunique() == nUIDs, 'Lost UIDs during resolve_units'
resolved.to_csv(cons.output_file, **cons.csv_opts)