def test_max_len_string_array(self):
    """Check ``lib.max_len_string_array`` on object arrays of str and bytes.

    The input holds 'foo', 'b' and NaN; the expected maximum length is 3.
    A non-object (``'U'`` dtype) array must be rejected with ``TypeError``.
    """
    arr = a = np.array(['foo', 'b', np.nan], dtype='object')
    # NOTE: compare with ``==``; ``assert value, 3`` would only use 3 as the
    # failure message and pass for any truthy length.
    assert lib.max_len_string_array(arr) == 3

    # unicode
    arr = a.astype('U').astype(object)
    assert lib.max_len_string_array(arr) == 3

    # bytes for python3
    arr = a.astype('S').astype(object)
    assert lib.max_len_string_array(arr) == 3

    # raises
    with pytest.raises(TypeError):
        lib.max_len_string_array(arr.astype('U'))
def test_max_len_string_array(self):
    """``lib.max_len_string_array`` reports 3 for 'foo', 'b', NaN inputs.

    Exercised on the raw object array and on object arrays obtained by a
    round trip through the ``'U'`` (unicode) and ``'S'`` (bytes) dtypes;
    a non-object array must raise ``TypeError``.
    """
    base = np.array(['foo', 'b', np.nan], dtype='object')
    assert lib.max_len_string_array(base) == 3

    # 'U' -> unicode, 'S' -> bytes for python3
    for code in ('U', 'S'):
        round_tripped = base.astype(code).astype(object)
        assert lib.max_len_string_array(round_tripped) == 3

    # raises: the last round-tripped array (bytes objects) is cast to a
    # non-object 'U' dtype, which the function does not accept
    non_object = round_tripped.astype('U')
    with pytest.raises(TypeError):
        lib.max_len_string_array(non_object)
def test_max_len_string_array(self):
    """Check ``lib.max_len_string_array`` on object arrays of str and bytes.

    The input holds 'foo', 'b' and NaN; the expected maximum length is 3.
    A non-object (``'U'`` dtype) array must be rejected with ``TypeError``.
    """
    arr = a = np.array(['foo', 'b', np.nan], dtype='object')
    # NOTE: assertEqual, not assertTrue -- ``assertTrue(value, 3)`` treats 3
    # as the failure message and only checks that the length is truthy.
    self.assertEqual(lib.max_len_string_array(arr), 3)

    # unicode
    arr = a.astype('U').astype(object)
    self.assertEqual(lib.max_len_string_array(arr), 3)

    # bytes for python3
    arr = a.astype('S').astype(object)
    self.assertEqual(lib.max_len_string_array(arr), 3)

    # raises
    tm.assertRaises(TypeError,
                    lambda: lib.max_len_string_array(arr.astype('U')))
def _convert_string_array(dt, encoding, itemsize=None): from pandas._libs import lib from pandas.core.dtypes.common import _ensure_object if dt.dtype.name == 'object': # encode if needed if encoding is not None and len(dt): dt = pd.Series(dt.ravel()).str.encode(encoding).values.reshape( dt.shape) # create the sized dtype if itemsize is None: itemsize = lib.max_len_string_array(_ensure_object(dt.ravel())) dt = np.asarray(dt, dtype="S%d" % itemsize) return dt else: return dt
def _convert_types(self, a): """ Converts object arrays of strings to numpy string arrays """ # No conversion for scalar type if a.dtype != 'object': return a, None # We can't infer the type of an empty array, so just # assume strings if len(a) == 0: return a.astype('U1'), None # Compute a mask of missing values. Replace NaNs and Nones with # empty strings so that type inference has a chance. mask = pd.isnull(a) if mask.sum() > 0: a = a.copy() np.putmask(a, mask, '') else: mask = None if infer_dtype(a) == 'mixed': # assume its a string, otherwise raise an error try: a = np.array([s.encode('ascii') for s in a]) a = a.astype('O') except: raise ValueError( "Column of type 'mixed' cannot be converted to string") type_ = infer_dtype(a) if type_ in ['unicode', 'string']: max_len = max_len_string_array(a) return a.astype('U{:d}'.format(max_len)), mask else: raise ValueError('Cannot store arrays with {} dtype'.format(type_))
def _convert_types(self, a): """ Converts object arrays of strings to numpy string arrays """ # No conversion for scalar type if a.dtype != 'object': return a, None # We can't infer the type of an empty array, so just # assume strings if len(a) == 0: return a.astype('U1'), None # Compute a mask of missing values. Replace NaNs and Nones with # empty strings so that type inference has a chance. mask = pd.isnull(a) if mask.sum() > 0: a = a.copy() np.putmask(a, mask, '') else: mask = None if infer_dtype(a) == 'mixed': # assume its a string, otherwise raise an error try: a = np.array([s.encode('ascii') for s in a]) a = a.astype('O') except: raise ValueError("Column of type 'mixed' cannot be converted to string") type_ = infer_dtype(a) if type_ in ['unicode', 'string']: max_len = max_len_string_array(a) return a.astype('U{:d}'.format(max_len)), mask else: raise ValueError('Cannot store arrays with {} dtype'.format(type_))