Example #1
def test_cleaning_pipeline():
    """This function tests our cleaning pipeline to make sure that
    garbage values are removed and ranks are create
    """
    # read in the df with our reader function so it can be passed to the cleaning steps and checks below
    df = prep.read_then_clean(FILEPATH, STR_VARS)

    # then pass it through the rest of the cleaning pipeline so that df can be compared to df_clean
    df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
    df_clean = prep.extract_ranking(df_clean, NUM_VARS)
    df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)

    # assert that the rank columns were generated by the pipeline
    for x in RANK_VARS:
        # verify that the rank column was not originally present in df
        assert x not in df, "rank column present in raw data"
        # assert that this column was added
        assert x in df_clean, "rank column was not added by extract_ranking"

    # assert that garbage was removed
    for x in STR_VARS:
        for y in STR_GARBAGE:
            print(x, y)
            # assert that the garbage value was removed
            assert y not in df_clean[x].unique(), \
                "garbage values not removed from clean dataframe"
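
# For reference, a minimal sketch of the behaviour the test above assumes from
# prep.remove_garbage_codes: replace any garbage string in the given columns
# with NaN so it no longer shows up in .unique(). This is an illustrative
# assumption, not the actual implementation in the prep module.
import numpy as np
import pandas as pd

def remove_garbage_codes_sketch(df, columns, garbage):
    """Return a copy of df with garbage strings in `columns` set to NaN."""
    out = df.copy()
    for col in columns:
        out[col] = out[col].replace(to_replace=list(garbage), value=np.nan)
    return out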
DEP_VAR = "housing_roof"
PRED_VAR = DEP_VAR + "_rank"  # we will always use the string categories to predict the ranking

# set up a filter to select which surveys to work with
SVY_FILTER = ['MACRO_DHS']

#garbage lists
STR_GARBAGE = ['nan', 'other', 'not a dejure resident', 'not dejure resident']
RANK_GARBAGE = ['4', '5', '6', '7', '8', '9', 'n']

#dictionaries
PRED_DICT = {'natural': '1', 'rudimentary': '2', 'finished': '3'}  # map categories back to ranks
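
# Illustration only (an assumption about how PRED_DICT is meant to be used,
# not taken from the prep module): pandas' Series.map converts the string
# material categories into their rank codes.
import pandas as pd
_example_ranks = pd.Series(['natural', 'finished', 'rudimentary']).map(PRED_DICT)
# _example_ranks now holds '1', '3', '2'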

df = prep.read_then_clean(DATA_DIR + "/" + DATA_FILENAME, STR_VARS, SVY_FILTER)
df_clean = prep.remove_garbage_codes(df, STR_VARS, STR_GARBAGE)
df_clean = prep.extract_ranking(df_clean, NUM_VARS)
df_clean = prep.remove_garbage_codes(df_clean, RANK_VARS, RANK_GARBAGE)
df_clean = df_clean.dropna(subset=[DEP_VAR])
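
# Quick sanity check on the cleaned frame (illustrative only; it assumes
# PRED_VAR is one of the rank columns produced by extract_ranking):
assert PRED_VAR in df_clean.columns, "expected rank column is missing"
print(df_clean[[DEP_VAR, PRED_VAR]].drop_duplicates().head())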

class FilterOneWordMaterials(unittest.TestCase):
    """Tests for `filter_one_word_materials.py`."""

    def test_expected_number_of_rows(self):
        """Has the function successfully filtered out all the materials described with more than one word?"""
        df = df_clean[0:20]
        # expected count: number of rows whose DEP_VAR value is a single word
        expected = sum(df[DEP_VAR].str.get_dummies(sep=' ').T.sum() == 1)
        self.assertEqual(sem.filter_one_word_materials(df, DEP_VAR).shape[0], expected)

    def test_raise_error_if_no_material_with_one_word(self):
        """Does the function raise an error if there is no material described with one word in the corpus?"""