def dataframe_to_thrift_struct(df, name, roffset, coffset, rows, cols, format): """ :type df: pandas.core.frame.DataFrame :type name: str :type coffset: int :type roffset: int :type rows: int :type cols: int :type format: str """ dim = len(df.axes) num_rows = df.shape[0] num_cols = df.shape[1] if dim > 1 else 1 array_chunk = GetArrayResponse() array_chunk.slice = name array_chunk.rows = num_rows array_chunk.cols = num_cols array_chunk.format = "" array_chunk.type = "" array_chunk.max = "0" array_chunk.min = "0" if (rows, cols) == (-1, -1): rows, cols = num_rows, num_cols rows = min(rows, MAXIMUM_ARRAY_SIZE) cols = min(cols, MAXIMUM_ARRAY_SIZE, num_cols) # need to precompute column bounds here before slicing! col_bounds = [None] * cols dtypes = [None] * cols if dim > 1: for col in range(cols): dtype = df.dtypes.iloc[coffset + col].kind dtypes[col] = dtype if dtype in "biufc": cvalues = df.iloc[:, coffset + col] bounds = (cvalues.min(), cvalues.max()) else: bounds = (0, 0) col_bounds[col] = bounds else: dtype = df.dtype.kind dtypes[0] = dtype col_bounds[0] = (df.min(), df.max()) if dtype in "biufc" else (0, 0) df = df.iloc[roffset: roffset + rows, coffset: coffset + cols] if dim > 1 else df.iloc[roffset: roffset + rows] rows = df.shape[0] cols = df.shape[1] if dim > 1 else 1 format = format.replace('%', '') def col_to_format(c): return format if dtypes[c] == 'f' and format else array_default_format(dtypes[c]) array_chunk.headers = header_data_to_thrift_struct(rows, cols, dtypes, col_bounds, col_to_format, df, dim) array_chunk.data = array_data_to_thrift_struct(rows, cols, lambda r: (("%" + col_to_format(c)) % (df.iat[r, c] if dim > 1 else df.iat[r]) for c in range(cols))) return array_chunk
def dataframe_to_thrift_struct(df, name, roffset, coffset, rows, cols, format): """ :type df: pandas.core.frame.DataFrame :type name: str :type coffset: int :type roffset: int :type rows: int :type cols: int :type format: str """ dim = len(df.axes) num_rows = df.shape[0] num_cols = df.shape[1] if dim > 1 else 1 array_chunk = GetArrayResponse() array_chunk.slice = name array_chunk.rows = num_rows array_chunk.cols = num_cols array_chunk.format = "" array_chunk.type = "" array_chunk.max = "0" array_chunk.min = "0" if (rows, cols) == (-1, -1): rows, cols = num_rows, num_cols rows = min(rows, MAXIMUM_ARRAY_SIZE) cols = min(cols, MAXIMUM_ARRAY_SIZE, num_cols) # need to precompute column bounds here before slicing! col_bounds = [None] * cols dtypes = [None] * cols if dim > 1: for col in range(cols): dtype = df.dtypes.iloc[coffset + col].kind dtypes[col] = dtype if dtype in "biufc": cvalues = df.iloc[:, coffset + col] bounds = (cvalues.min(), cvalues.max()) else: bounds = (0, 0) col_bounds[col] = bounds else: dtype = df.dtype.kind dtypes[0] = dtype col_bounds[0] = (df.min(), df.max()) if dtype in "biufc" else (0, 0) df = df.iloc[roffset:roffset + rows, coffset:coffset + cols] if dim > 1 else df.iloc[roffset:roffset + rows] rows = df.shape[0] cols = df.shape[1] if dim > 1 else 1 format = format.replace('%', '') def col_to_format(c): return format if dtypes[c] == 'f' and format else array_default_format( dtypes[c]) array_chunk.headers = header_data_to_thrift_struct(rows, cols, dtypes, col_bounds, col_to_format, df, dim) array_chunk.data = array_data_to_thrift_struct( rows, cols, lambda r: (("%" + col_to_format(c)) % (df.iat[r, c] if dim > 1 else df.iat[r]) for c in range(cols))) return array_chunk
def array_to_meta_thrift_struct(array, name, format): type = array.dtype.kind slice = name l = len(array.shape) # initial load, compute slice if format == '%': if l > 2: slice += '[0]' * (l - 2) for r in range(l - 2): array = array[0] if type == 'f': format = '.5f' elif type == 'i' or type == 'u': format = 'd' else: format = 's' else: format = format.replace('%', '') l = len(array.shape) reslice = "" if l > 2: raise Exception("%s has more than 2 dimensions." % slice) elif l == 1: # special case with 1D arrays arr[i, :] - row, but arr[:, i] - column with equal shape and ndim # http://stackoverflow.com/questions/16837946/numpy-a-2-rows-1-column-file-loadtxt-returns-1row-2-columns # explanation: http://stackoverflow.com/questions/15165170/how-do-i-maintain-row-column-orientation-of-vectors-in-numpy?rq=1 # we use kind of a hack - get information about memory from C_CONTIGUOUS is_row = array.flags['C_CONTIGUOUS'] if is_row: rows = 1 cols = len(array) if cols < len(array): reslice = '[0:%s]' % (cols) array = array[0:cols] else: cols = 1 rows = len(array) if rows < len(array): reslice = '[0:%s]' % (rows) array = array[0:rows] elif l == 2: rows = array.shape[-2] cols = array.shape[-1] if cols < array.shape[-1] or rows < array.shape[-2]: reslice = '[0:%s, 0:%s]' % (rows, cols) array = array[0:rows, 0:cols] # avoid slice duplication if not slice.endswith(reslice): slice += reslice bounds = (0, 0) if type in "biufc": bounds = (array.min(), array.max()) array_chunk = GetArrayResponse() array_chunk.slice = slice array_chunk.rows = rows array_chunk.cols = cols array_chunk.format = format array_chunk.type = type array_chunk.max = "%s" % bounds[1] array_chunk.min = "%s" % bounds[0] return array, array_chunk, rows, cols, format
def dataframe_to_thrift_struct(df, name, roffset, coffset, rows, cols, format): """ :type df: pandas.core.frame.DataFrame :type name: str :type coffset: int :type roffset: int :type rows: int :type cols: int :type format: str """ dim = len(df.axes) num_rows = df.shape[0] num_cols = df.shape[1] if dim > 1 else 1 array_chunk = GetArrayResponse() array_chunk.slice = name array_chunk.rows = num_rows array_chunk.cols = num_cols array_chunk.type = "" array_chunk.max = "0" array_chunk.min = "0" format = format.replace("%", "") if not format: if num_rows > 0 and num_cols == 1: # series or data frame with one column try: kind = df.dtype.kind except AttributeError: try: kind = df.dtypes[0].kind except IndexError: kind = "O" format = array_default_format(kind) else: format = array_default_format("f") array_chunk.format = "%" + format if (rows, cols) == (-1, -1): rows, cols = num_rows, num_cols rows = min(rows, MAXIMUM_ARRAY_SIZE) cols = min(cols, MAXIMUM_ARRAY_SIZE, num_cols) # need to precompute column bounds here before slicing! col_bounds = [None] * cols dtypes = [None] * cols if dim > 1: for col in range(cols): dtype = df.dtypes.iloc[coffset + col].kind dtypes[col] = dtype if dtype in NUMPY_NUMERIC_TYPES: cvalues = df.iloc[:, coffset + col] bounds = (cvalues.min(), cvalues.max()) else: bounds = (0, 0) col_bounds[col] = bounds else: dtype = df.dtype.kind dtypes[0] = dtype col_bounds[0] = (df.min(), df.max()) if dtype in NUMPY_NUMERIC_TYPES else (0, 0) df = df.iloc[roffset: roffset + rows, coffset: coffset + cols] if dim > 1 else df.iloc[roffset: roffset + rows] rows = df.shape[0] cols = df.shape[1] if dim > 1 else 1 def col_to_format(c): return format if dtypes[c] in NUMPY_NUMERIC_TYPES and format else array_default_format(dtypes[c]) iat = df.iat if dim == 1 or len(df.columns.unique()) == len(df.columns) else df.iloc array_chunk.headers = header_data_to_thrift_struct(rows, cols, dtypes, col_bounds, col_to_format, df, dim) array_chunk.data = array_data_to_thrift_struct(rows, cols, lambda r: (("%" + col_to_format(c)) % (iat[r, c] if dim > 1 else iat[r]) for c in range(cols)), format) return array_chunk