def get_inspect(self, n=10, offset=0, columns=None,
                wrap=inspect_settings._unspecified,
                truncate=inspect_settings._unspecified,
                round=inspect_settings._unspecified,
                width=inspect_settings._unspecified,
                margin=inspect_settings._unspecified,
                with_types=inspect_settings._unspecified):
    """Returns an ATable object representing the table inspect --see frame.inspect()"""
    from sparktk.frame.ops.take import take_rich

    # resolve any unspecified formatting args against the global inspect_settings
    settings = inspect_settings.copy(wrap, truncate, round, width, margin, with_types)
    taken = take_rich(self, n, offset, columns)
    return ATable(taken.data, taken.schema, offset=offset, format_settings=settings)
def inspect(self, n=10, offset=0, columns=None,
            wrap=inspect_settings._unspecified,
            truncate=inspect_settings._unspecified,
            round=inspect_settings._unspecified,
            width=inspect_settings._unspecified,
            margin=inspect_settings._unspecified,
            with_types=inspect_settings._unspecified):
    """
    Pretty-print of the frame data

    Essentially returns a string, but technically returns a RowInspection object which renders a string.
    The RowInspection object naturally converts to a str when needed, like when printed or when displayed
    by python REPL (i.e. using the object's __repr__).  If running in a script and want the inspect output
    to be printed, then it must be explicitly printed, then `print frame.inspect()`

    Parameters
    ----------
    :param n: (Optional[int]) The number of rows to print
    :param offset: (Optional[int]) The number of rows to skip before printing.
    :param columns: (Optional[List[str]]) Filter columns to be included.  By default, all columns are included.
    :param wrap: (Optional[int or 'stripes']) If set to 'stripes' then inspect prints rows in stripes; if set to an
                 integer N, rows will be printed in clumps of N columns, where the columns are wrapped.
    :param truncate: (Optional[int]) If set to integer N, all strings will be truncated to length N, including all
                     tagged ellipses.
    :param round: (Optional[int]) If set to integer N, all floating point numbers will be rounded and truncated
                  to N digits.
    :param width: (Optional[int]) If set to integer N, the print out will try to honor a max line width of N.
    :param margin: (Optional[int]) Applies to 'stripes' mode only.  If set to integer N, the margin for printing names
                   in a stripe will be limited to N characters.
    :param with_types: (Optional[bool]) If set to True, header will include the data_type of each column.
    :return: (RowsInspection) An object which naturally converts to a pretty-print string.

    Examples
    --------
    To look at the first 4 rows of data in a frame:

    <skip>
    >>> frame.inspect(4)
    [#]  animal    name    age  weight
    ==================================
    [0]  human     George    8   542.5
    [1]  human     Ursula    6   495.0
    [2]  ape       Ape      41   400.0
    [3]  elephant  Shep      5  8630.0

    </skip>

    # For other examples, see :ref:`example_frame.inspect`.

    Note: if the frame data contains unicode characters, this method may raise a Unicode exception when
    running in an interactive REPL or otherwise which triggers the standard python repr().  To get around
    this problem, explicitly print the unicode of the returned object:

    <skip>
    >>> print unicode(frame.inspect())

    </skip>

    **Global Settings**

    If not specified, the arguments that control formatting receive default values from
    'sparktk.inspect_settings'.  Make changes there to affect all calls to inspect.

    >>> import sparktk
    >>> sparktk.inspect_settings
    wrap             20
    truncate       None
    round          None
    width            80
    margin         None
    with_types    False
    >>> sparktk.inspect_settings.width = 120  # changes inspect to use 120 width globally
    >>> sparktk.inspect_settings.truncate = 16  # changes inspect to always truncate strings to 16 chars
    >>> sparktk.inspect_settings
    wrap             20
    truncate         16
    round          None
    width           120
    margin         None
    with_types    False
    >>> sparktk.inspect_settings.width = None  # return value back to default
    >>> sparktk.inspect_settings
    wrap             20
    truncate         16
    round          None
    width            80
    margin         None
    with_types    False
    >>> sparktk.inspect_settings.reset()  # set everything back to default
    >>> sparktk.inspect_settings
    wrap             20
    truncate       None
    round          None
    width            80
    margin         None
    with_types    False

    """
    from sparktk.frame.ops.take import take_rich

    # resolve any unspecified formatting args against the global inspect_settings
    format_settings = inspect_settings.copy(wrap, truncate, round, width, margin, with_types)
    result = take_rich(self, n, offset, columns)
    return RowsInspection(result.data, result.schema, offset=offset, format_settings=format_settings)
def to_pandas(self, n=None, offset=0, columns=None): """ Brings data into a local pandas dataframe. Similar to the 'take' function, but puts the data into a pandas dataframe. Parameters ---------- :param n: (Optional(int)) The number of rows to get from the frame (warning: do not overwhelm the python session by taking too much) :param offset: (Optional(int)) The number of rows to skip before copying. Defaults to 0. :param columns: (Optional(List[str])) Column filter. The list of names to be included. Default is all columns. :return: (pandas.DataFrame) A new pandas dataframe object containing the taken frame data. Examples -------- <hide> >>> data = [["Fred", "555-1234"],["Susan", "555-0202"],["Thurston","555-4510"],["Judy","555-2183"]] >>> column_names = ["name", "phone"] >>> frame = tc.frame.create(data, column_names) </hide> Consider the following spark-tk frame, where we have columns for name and phone number: >>> frame.inspect() [#] name phone ======================= [0] Fred 555-1234 [1] Susan 555-0202 [2] Thurston 555-4510 [3] Judy 555-2183 >>> frame.schema [('name', <type 'str'>), ('phone', <type 'str'>)] The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame. Note that since no parameters are provided when to_pandas() is called, the default values are used for the number of rows, the row offset, and the columns. >>> pandas_frame = frame.to_pandas() >>> pandas_frame name phone 0 Fred 555-1234 1 Susan 555-0202 2 Thurston 555-4510 3 Judy 555-2183 """ try: import pandas except: raise RuntimeError( "pandas module not found, unable to download. Install pandas or try the take command." 
) from sparktk.frame.ops.take import take_rich result = take_rich(self, n, offset, columns) headers, data_types = zip(*result.schema) frame_data = result.data from sparktk import dtypes import datetime date_time_columns = [ i for i, x in enumerate(self.schema) if x[1] in (dtypes.datetime, datetime.datetime) ] has_date_time = len(date_time_columns) > 0 # translate our datetime long to datetime, so that it gets into the pandas df as a datetime column def long_to_date_time(row): for i in date_time_columns: if isinstance(row[i], long): row[i] = datetime.datetime.fromtimestamp( row[i] // 1000).replace(microsecond=row[i] % 1000 * 1000) return row if (has_date_time): frame_data = map(long_to_date_time, frame_data) # create pandas df pandas_df = pandas.DataFrame(frame_data, columns=headers) for i, dtype in enumerate(data_types): dtype_str = _sparktk_dtype_to_pandas_str(dtype) try: pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype(dtype_str) except (TypeError, ValueError): if dtype_str.startswith("int"): # DataFrame does not handle missing values in int columns. If we get this error, use the 'object' datatype instead. print "WARNING - Encountered problem casting column %s to %s, possibly due to missing values (i.e. presence of None). Continued by casting column %s as 'object'" % ( headers[i], dtype_str, headers[i]) pandas_df[[headers[i]]] = pandas_df[[headers[i] ]].astype("object") else: raise return pandas_df
def to_pandas(self, n=None, offset=0, columns=None): """ Brings data into a local pandas dataframe. Similar to the 'take' function, but puts the data into a pandas dataframe. Parameters ---------- :param n: (Optional(int)) The number of rows to get from the frame (warning: do not overwhelm the python session by taking too much) :param offset: (Optional(int)) The number of rows to skip before copying. Defaults to 0. :param columns: (Optional(List[str])) Column filter. The list of names to be included. Default is all columns. :return: (pandas.DataFrame) A new pandas dataframe object containing the taken frame data. Examples -------- <hide> >>> data = [["Fred", "555-1234"],["Susan", "555-0202"],["Thurston","555-4510"],["Judy","555-2183"]] >>> column_names = ["name", "phone"] >>> frame = tc.frame.create(data, column_names) </hide> Consider the following spark-tk frame, where we have columns for name and phone number: >>> frame.inspect() [#] name phone ======================= [0] Fred 555-1234 [1] Susan 555-0202 [2] Thurston 555-4510 [3] Judy 555-2183 >>> frame.schema [('name', <type 'str'>), ('phone', <type 'str'>)] The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame. Note that since no parameters are provided when to_pandas() is called, the default values are used for the number of rows, the row offset, and the columns. >>> pandas_frame = frame.to_pandas() >>> pandas_frame name phone 0 Fred 555-1234 1 Susan 555-0202 2 Thurston 555-4510 3 Judy 555-2183 """ try: import pandas except: raise RuntimeError("pandas module not found, unable to download. 
Install pandas or try the take command.") from sparktk.frame.ops.take import take_rich result = take_rich(self, n, offset, columns) headers, data_types = zip(*result.schema) frame_data = result.data from sparktk import dtypes import datetime date_time_columns = [i for i, x in enumerate(self.schema) if x[1] in (dtypes.datetime, datetime.datetime)] has_date_time = len(date_time_columns) > 0 # translate our datetime long to datetime, so that it gets into the pandas df as a datetime column def long_to_date_time(row): for i in date_time_columns: if isinstance(row[i], long): row[i] = datetime.datetime.fromtimestamp(row[i]//1000).replace(microsecond=row[i]%1000*1000) return row if (has_date_time): frame_data = map(long_to_date_time, frame_data) # create pandas df pandas_df = pandas.DataFrame(frame_data, columns=headers) for i, dtype in enumerate(data_types): dtype_str = _sparktk_dtype_to_pandas_str(dtype) try: pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype(dtype_str) except (TypeError, ValueError): if dtype_str.startswith("int"): # DataFrame does not handle missing values in int columns. If we get this error, use the 'object' datatype instead. print "WARNING - Encountered problem casting column %s to %s, possibly due to missing values (i.e. presence of None). Continued by casting column %s as 'object'" % (headers[i], dtype_str, headers[i]) pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype("object") else: raise return pandas_df