def test_clean(self):
    """Check ``clean`` with default arguments and with every step disabled."""
    raw_names = pd.Series([
        'Mary-ann',
        'Bob :)',
        'Angel',
        'Bob (alias Billy)',
        'Mary ann',
        'John',
        np.nan,
    ])
    wanted = pd.Series([
        'mary ann',
        'bob',
        'angel',
        'bob',
        'mary ann',
        'john',
        np.nan,
    ])

    # Default arguments must produce the normalised names listed above.
    pdt.assert_series_equal(clean(raw_names), wanted)

    # With every cleaning option switched off the input must come back
    # unchanged.
    all_steps_off = dict(
        lowercase=False,
        replace_by_none=False,
        replace_by_whitespace=False,
        strip_accents=False,
        remove_brackets=False,
    )
    untouched = clean(raw_names, **all_steps_off)
    pdt.assert_series_equal(untouched, raw_names)
def test_clean_parameters(self):
    """Check ``clean`` when every cleaning option is passed explicitly."""
    values = pd.Series([
        u'Mary-ann',
        u'Bob :)',
        u'Angel',
        u'Bob (alias Billy)',
        u'Mary ann',
        u'John',
        np.nan,
    ])
    expected = pd.Series([
        u'mary ann',
        u'bob',
        u'angel',
        u'bob',
        u'mary ann',
        u'john',
        np.nan,
    ])

    # Raw strings keep the regex backslashes literal; the non-raw form
    # relied on invalid escape sequences (``\-``, ``\_``), which newer
    # Python versions warn about.
    clean_series = clean(
        values,
        lowercase=True,
        replace_by_none=r'[^ \-\_A-Za-z0-9]+',
        replace_by_whitespace=r'[\-\_]',
        remove_brackets=True,
    )

    # The cleaned series must match the expected names exactly.
    pdt.assert_series_equal(clean_series, expected)
def test_clean_unicode(self):
    """Check ``clean`` with default arguments on unicode-literal input."""
    unicode_names = pd.Series([
        u'Mary-ann',
        u'Bob :)',
        u'Angel',
        u'Bob (alias Billy)',
        u'Mary ann',
        u'John',
        np.nan,
    ])
    wanted = pd.Series([
        u'mary ann',
        u'bob',
        u'angel',
        u'bob',
        u'mary ann',
        u'john',
        np.nan,
    ])

    # Output must equal the expected series element for element.
    pdt.assert_series_equal(clean(unicode_names), wanted)
def test_clean_brackets(self):
    """Check that ``remove_brackets=True`` drops bracketed text."""
    with_brackets = pd.Series(
        [np.nan, 'bra(cke)ts', 'brackets with (brackets)'])
    without_brackets = pd.Series([np.nan, 'brats', 'brackets with'])

    result = clean(with_brackets, remove_brackets=True)

    # The missing value is expected to stay missing; the other entries
    # lose their bracketed parts.
    pdt.assert_series_equal(result, without_brackets)
def test_clean_lower(self):
    """Check that ``clean`` lowercases strings when asked to."""
    values = pd.Series([np.nan, 'LowerHigher', 'HIGHERLOWER'])
    expected = pd.Series([np.nan, 'lowerhigher', 'higherlower'])

    # Use the ``lowercase`` keyword, which is the spelling every other
    # call to ``clean`` in this test module uses (previously ``lower``).
    clean_series = clean(values, lowercase=True)

    # The missing value is expected to stay missing; strings are lowercased.
    pdt.assert_series_equal(clean_series, expected)
def test_clean_lower(self):
    """Check that ``lowercase=True`` converts mixed-case strings to lowercase."""
    mixed_case = pd.Series([np.nan, 'LowerHigher', 'HIGHERLOWER'])
    lowered = pd.Series([np.nan, 'lowerhigher', 'higherlower'])

    # Compare the cleaned output directly against the lowercase expectation.
    pdt.assert_series_equal(clean(mixed_case, lowercase=True), lowered)
def test_clean_accent_stripping(self):
    """Check accent stripping for the 'unicode' and 'ascii' algorithms.

    Both plain and unicode-literal inputs are cleaned with each algorithm
    and compared against the accent-free expectation. An unrecognised
    algorithm name must raise ``ValueError``.
    """
    values = pd.Series(['ősdfésdfë', 'without'])
    expected = pd.Series(['osdfesdfe', 'without'])

    values_unicode = pd.Series([u'ősdfésdfë', u'without'])
    expected_unicode = pd.Series([u'osdfesdfe', u'without'])

    # TODO(review): a case for a callable ``strip_accents`` was sketched
    # here but disabled; reinstate it if callables are meant to be supported.

    cases = (
        (values, expected),
        (values_unicode, expected_unicode),
    )
    for series, wanted in cases:
        for algorithm in ('unicode', 'ascii'):
            pdt.assert_series_equal(
                clean(series, strip_accents=algorithm), wanted)

    # An unknown algorithm name is rejected.
    with self.assertRaises(ValueError):
        clean(values, strip_accents='unknown_algorithm')
def test_clean_unicode(self):
    """Check default ``clean`` behaviour on a series of unicode literals."""
    source = [
        u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)',
        u'Mary ann', u'John', np.nan,
    ]
    target = [
        u'mary ann', u'bob', u'angel', u'bob',
        u'mary ann', u'john', np.nan,
    ]
    values = pd.Series(source)
    expected = pd.Series(target)

    cleaned = clean(values)

    # Cleaned output and expectation must be identical series.
    pdt.assert_series_equal(cleaned, expected)
def test_clean(self):
    """Check default cleaning, then that disabling all options is a no-op."""
    names = pd.Series([
        'Mary-ann', 'Bob :)', 'Angel', 'Bob (alias Billy)',
        'Mary ann', 'John', np.nan,
    ])
    names_cleaned = pd.Series([
        'mary ann', 'bob', 'angel', 'bob',
        'mary ann', 'john', np.nan,
    ])

    # Part 1: default arguments give the normalised names.
    default_result = clean(names)
    pdt.assert_series_equal(default_result, names_cleaned)

    # Part 2: nothing may change when every cleaning option is disabled.
    disabled_result = clean(
        names,
        lowercase=False,
        replace_by_none=False,
        replace_by_whitespace=False,
        strip_accents=False,
        remove_brackets=False,
    )
    pdt.assert_series_equal(disabled_result, names)
def test_clean_parameters(self):
    """Check ``clean`` with explicitly supplied cleaning parameters."""
    values = pd.Series([
        u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)',
        u'Mary ann', u'John', np.nan,
    ])
    expected = pd.Series([
        u'mary ann', u'bob', u'angel', u'bob',
        u'mary ann', u'john', np.nan,
    ])

    # The patterns are written as raw strings: ``\-`` and ``\_`` are not
    # valid escape sequences in an ordinary string literal and trigger
    # warnings on newer Python versions. The pattern text is unchanged.
    clean_series = clean(
        values,
        lowercase=True,
        replace_by_none=r'[^ \-\_A-Za-z0-9]+',
        replace_by_whitespace=r'[\-\_]',
        remove_brackets=True,
    )

    # The cleaned series must be identical to the expectation.
    pdt.assert_series_equal(clean_series, expected)
def test_clean_accent_stripping(self):
    """Check ``strip_accents`` with 'unicode', 'ascii' and an invalid name.

    Accented input, given both as plain and as unicode literals, must come
    back without accents for either algorithm. An unknown algorithm name
    must raise ``ValueError``.
    """
    values = pd.Series(['ősdfésdfë', 'without'])
    expected = pd.Series(['osdfesdfe', 'without'])

    values_unicode = pd.Series([u'ősdfésdfë', u'without'])
    expected_unicode = pd.Series([u'osdfesdfe', u'without'])

    # TODO(review): a disabled check for a callable ``strip_accents`` used
    # to live here; add it back if callables should be accepted.

    algorithms = ('unicode', 'ascii')

    # Plain string literals.
    for algorithm in algorithms:
        pdt.assert_series_equal(
            clean(values, strip_accents=algorithm), expected)

    # Unicode string literals.
    for algorithm in algorithms:
        pdt.assert_series_equal(
            clean(values_unicode, strip_accents=algorithm),
            expected_unicode)

    # Unsupported algorithm names are rejected.
    with self.assertRaises(ValueError):
        clean(values, strip_accents='unknown_algorithm')
def test_clean_empty(self):
    """Check that cleaning an empty Series yields an empty Series."""
    empty_input = pd.Series()
    empty_expected = pd.Series()

    # Nothing to clean: output must equal a freshly built empty Series.
    pdt.assert_series_equal(clean(empty_input), empty_expected)