def __init__(self, tc, source_or_vertices_frame, edges_frame=None):
    """
    Build the graph either from a (vertices, edges) pair of python Frames, or by
    adopting an existing graph object: a Scala Graph, a python GraphFrame, or a
    scala GraphFrame.
    """
    self._tc = tc
    self._scala = None
    # (the Scala side validates that the frame schemas are appropriate)
    if isinstance(source_or_vertices_frame, Frame):
        # Two python Frames: vertices plus a mandatory edges frame.
        require_type(Frame, edges_frame, 'edges_frame', "Providing a vertices frame requires also providing an edges frame")
        self._scala = self._create_scala_graph_from_scala_frames(self._tc,
                                                                 source_or_vertices_frame._scala,
                                                                 edges_frame._scala)
        return
    candidate = source_or_vertices_frame
    # When the first arg is not a vertices Frame, an edges frame makes no sense.
    require_type(None, edges_frame, 'edges_frame', 'If edges_frames is provided, then a valid vertex frame must be provided as the first arg, instead of type %s' % type(candidate))
    if self._is_scala_graph(candidate):
        # Already a Scala Graph; adopt it directly.
        self._scala = candidate
    elif isinstance(candidate, GraphFrame):
        # A python GraphFrame wrapper; unwrap its JVM graph before converting.
        self._scala = self._create_scala_graph_from_scala_graphframe(self._tc, candidate._jvm_graph)
    elif self._is_scala_graphframe(candidate):
        # A raw scala GraphFrame reference.
        self._scala = self._create_scala_graph_from_scala_graphframe(self._tc, candidate)
    else:
        raise TypeError("Cannot create from source type %s" % type(candidate))
def recommend(self, entity_id, number_of_recommendations=1, recommend_products=True):
    """
    recommend products to users or vice versa

    :param entity_id: (int) A user/product id
    :param number_of_recommendations: (int) Number of recommendations
    :param recommend_products: (bool) True - products for user; false - users for the product
    :return: Returns an array of recommendations (as array of csv-strings)
    """
    require_type(int, entity_id, "entity_id")
    require_type.non_negative_int(number_of_recommendations, "number_of_recommendations")
    require_type(bool, recommend_products, "recommend_products")
    # The Scala call hands back a Scala Seq of Scala Maps: peel off the outer
    # Seq first, then convert each contained Map into a python dict.
    scala_maps = self._tc.jutils.convert.from_scala_seq(
        self._scala.recommend(entity_id, number_of_recommendations, recommend_products))
    return [self._tc.jutils.convert.scala_map_to_python(m) for m in scala_maps]
def recommend(self, entity_id, number_of_recommendations=1, recommend_products=True):
    """
    recommend products to users or vice versa

    :param entity_id: (int) A user/product id
    :param number_of_recommendations: (int) Number of recommendations
    :param recommend_products: (bool) True - products for user; false - users for the product
    :return: Returns an array of recommendations (as array of csv-strings)
    """
    require_type(int, entity_id, "entity_id")
    require_type.non_negative_int(number_of_recommendations, "number_of_recommendations")
    require_type(bool, recommend_products, "recommend_products")
    convert = self._tc.jutils.convert
    # recommend() on the Scala side returns a Seq of Maps; unwrap the Seq,
    # then translate each Map to a python dict.
    raw_result = self._scala.recommend(entity_id, number_of_recommendations, recommend_products)
    return [convert.scala_map_to_python(entry) for entry in convert.from_scala_seq(raw_result)]
def test_implicit(self):
    # Passing the 'implicit' sentinel must raise a ValueError explaining that
    # the arg is normally filled in implicitly.
    expected = "Missing value for arg 'a'. This value is normally filled implicitly, however, if this method is called standalone, it must be set explicitly"
    try:
        require_type(int, implicit, "a")
    except ValueError as e:
        self.assertEqual(expected, str(e))
    else:
        self.fail("A ValueError should have been raised")
def __init__(self, tc, source_or_vertices_frame, edges_frame=None):
    """
    Initialize the graph from either a pair of (vertices, edges) Frames or an
    existing graph object (Scala Graph, python GraphFrame, or scala GraphFrame).

    :param tc: TkContext
    :param source_or_vertices_frame: vertices Frame, or a graph source object
    :param edges_frame: (Optional(Frame)) edges Frame; required when the first
                        arg is a vertices Frame, and must be None otherwise
    """
    self._tc = tc
    self._scala = None
    # (note that the Scala code will validate appropriate frame schemas)
    if isinstance(source_or_vertices_frame, Frame):
        # Python Vertices and Edges Frames
        vertices_frame = source_or_vertices_frame
        # Fixed: require_type takes the expected type first, then the value,
        # then the arg name (see the sibling __init__ and the unit tests);
        # the previous call had these arguments shuffled.
        require_type(Frame, edges_frame, 'edges_frame', "Providing a vertices frame requires also providing an edges frame")
        self._scala = self.create_scala_graph_from_scala_frames(self._tc, vertices_frame._scala, edges_frame._scala)
    else:
        source = source_or_vertices_frame
        # Fixed argument order here as well (expected type None comes first).
        require_type(None, edges_frame, 'edges_frame', 'If edges_frames is provided, then a valid vertex frame must be provided as the first arg, instead of type %s' % type(source))
        if self._is_scala_graph(source):
            # Scala Graph
            self._scala = source
        elif isinstance(source, GraphFrame):
            # python GraphFrame
            scala_graphframe = source._jvm_graph
            self._scala = self.create_scala_graph_from_scala_graphframe(self._tc, scala_graphframe)
        elif self._is_scala_graphframe(source):
            # scala GraphFrame
            self._scala = self.create_scala_graph_from_scala_graphframe(self._tc, source)
        else:
            raise TypeError("Cannot create from source type %s" % type(source))
def validate(tc, arg_name='tc'):
    """
    Raises a ValueError if the tc variable is not of type TkContext

    Since tc is so commonly used as an implicit variable, it's worth the special code here to save a lot of imports otherwise
    """
    # Fixed: require_type takes the expected type first, then the value, then
    # the arg name -- see the doctested variant of this function and call
    # sites like require_type(Frame, frame, 'frame'). The previous call
    # passed (tc, arg_name, TkContext), which validates the wrong object.
    require_type(TkContext, tc, arg_name)
def test_basic_negative(self):
    # A string is not an int, so require_type must raise a TypeError whose
    # message names the expected type.
    expected = "Expected type <type 'int'>"
    try:
        require_type(int, "12", "a")
    except TypeError as e:
        message = str(e)
        self.assertTrue(expected in message, "\nexpected=%s\nmessage =%s" % (expected, message))
    else:
        self.fail("A TypeError should have been raised")
def test_implicit(self):
    # The implicit sentinel signals a value the caller forgot to supply; a
    # descriptive ValueError should result.
    caught = None
    try:
        require_type(int, implicit, "a")
    except ValueError as e:
        caught = e
    if caught is None:
        self.fail("A ValueError should have been raised")
    self.assertEqual(
        "Missing value for arg 'a'. This value is normally filled implicitly, however, if this method is called standalone, it must be set explicitly",
        str(caught))
def export_to_hive(self, hive_table_name, overwrite=False):
    """
    Write current frame to Hive table.

    Table must not exist in Hive. Hive does not support case sensitive table names and columns names.
    Hence column names with uppercase letters will be converted to lower case by Hive.

    Parameters
    ----------

    :param hive_table_name: (str) hive table name
    :param overwrite: (Optional(bool)) Specify whether or not to overwrite the hive table if it already
                      exists.  If overwrite is set to False, and the table already exists, an exception
                      is thrown.

    Example
    --------

    <skip>
    >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
    >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
    >>> my_frame = tc.frame.create(data, schema)
    <progress>
    </skip>

    table_name: (string): table name. It will create new table with given name if it does not exists already.

    <skip>
    >>> my_frame.export_to_hive("demo_test_hive")
    <progress>
    </skip>

    Verify exported frame in hive

    From bash shell

    $hive
    hive> show tables

    You should see demo_test_hive table.

    Run hive> select * from demo_test_hive; (to verify frame).

    To overwrite a table that already exists, set the overwrite parameter to 'True':

    <skip>
    >>> my_frame.export_to_hive("demo_test_hive", overwrite=True)
    </skip>
    """
    # Validate arguments python-side first so callers get a clear error
    # instead of a JVM stack trace.
    require_type.non_empty_str(hive_table_name, "hive_table_name")
    require_type(bool, overwrite, "overwrite")
    # The Scala frame implementation performs the actual export.
    self._scala.exportToHive(hive_table_name, overwrite)
def test(self, frame, observation_columns=None, label_column=None):
    """
    Test the frame given the trained model

    Parameters
    ----------

    :param frame: (Frame) The frame to predict on
    :param observation_columns: Optional(List[str]) List of column(s) containing the observations
    :param label_column: Optional(String) Column name containing the label for each observation
    :return: (RegressionTestMetrics) RegressionTestMetrics object consisting of results from model test
    """
    require_type(Frame, frame, 'frame')
    # None means "use the columns the model was trained on" (resolved Scala-side).
    observations = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
    convert = self._tc.jutils.convert
    scala_metrics = self._scala.test(frame._scala,
                                     convert.to_scala_option_list_string(observations),
                                     convert.to_scala_option(label_column))
    return RegressionTestMetrics(scala_metrics)
def expand_kwarg_grids(dictionaries):
    """
    Method to expand the dictionary of arguments

    Recursively fans out any GridValues entry into one dictionary per value,
    so a single kwargs dict with grid parameters becomes a list of concrete
    kwargs dicts.

    :param dictionaries: Parameters for the model of type (list of dict)
    :return: Expanded list of parameters for the model
    """
    arguments.require_type(list, dictionaries, "dictionaries")
    new_dictionaries = []
    for dictionary in dictionaries:
        # Fixed: validate each entry BEFORE iterating its items. The check
        # previously sat inside the inner loop, so a non-dict entry raised
        # AttributeError from .items() before require_type could report a
        # clear error (and the check was needlessly repeated per item).
        arguments.require_type(dict, dictionary, "item in dictionaries")
        for k, v in dictionary.items():
            if isinstance(v, GridValues):
                # Expand one grid parameter at a time; the recursion below
                # takes care of any remaining GridValues entries.
                for a in v.args:
                    d = dictionary.copy()
                    d[k] = a
                    new_dictionaries.append(d)
                break
    if new_dictionaries:
        return expand_kwarg_grids(new_dictionaries)
    return dictionaries
def train(frame, time_column, covariate_columns, censor_column, convergence_tolerance=1E-6, max_steps=100):
    """
    Creates a CoxProportionalHazardsModel by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame to train the model on
    :param time_column: (str) Column name containing the time of occurence of each observation.
    :param covariate_columns: (Seq[str]) List of column(s) containing the covariates.
    :param censor_column: (str) Column name containing censor value of each observation.
    :param convergence_tolerance: (float) Parameter for the convergence tolerance for iterative algorithms.
                                  Default is 1E-6
    :param max_steps: (int) Parameter for maximum number of steps. Default is 100
    :return: (CoxProportionalHazardsModel) A trained coxPh model
    """
    from sparktk.frame.frame import Frame
    # Fixed: require_type's third argument is the arg *name* that gets
    # embedded in the error message; prose like "frame cannot be None"
    # produced garbled errors ("... for arg 'frame cannot be None'").
    require_type(Frame, frame, "frame")
    require_type.non_empty_str(time_column, "time_column")
    require_type.non_empty_str(censor_column, "censor_column")
    require_type(float, convergence_tolerance, "convergence_tolerance")
    require_type.non_negative_int(max_steps, "max_steps")
    # Fixed: use the affirmed list (affirm_type's return value was discarded,
    # unlike every other call site, which assigns and uses the result).
    covariate_columns = affirm_type.list_of_str(covariate_columns, "covariate_columns")
    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_covariate_columns = tc.jutils.convert.to_scala_vector_string(covariate_columns)
    scala_model = _scala_obj.train(frame._scala,
                                   time_column,
                                   scala_covariate_columns,
                                   censor_column,
                                   convergence_tolerance,
                                   max_steps)
    return CoxProportionalHazardsModel(tc, scala_model)
def predict(self, frame, observation_columns=None):
    """
    Predict the values for the data points.

    Predict the values for a test frame using trained Random Forest Classifier model, and create a new frame revision
    with existing columns and a new predicted value's column.

    Parameters
    ----------

    :param frame: (Frame) A frame whose labels are to be predicted. By default, predict is run on the same columns
                  over which the model is trained.
    :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be
                                predicted. By default, we predict the labels over columns the Random Forest model
                                was trained on.
    :return: (Frame) A new frame consisting of the existing columns of the frame and a new column with predicted
             value for each observation.
    """
    require_type(Frame, frame, 'frame')
    # None means "use the training columns"; the Scala layer resolves that.
    observations = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
    scala_columns = self._tc.jutils.convert.to_scala_option_list_string(observations)
    predicted = self._scala.predict(frame._scala, scala_columns)
    return Frame(self._tc, predicted)
def validate(tc, arg_name='tc'): """ Validates that the given tc object is indeed a TkContext. Raises a ValueError if it is not. Examples -------- <hide> >>> from sparktk import TkContext </hide> >>> TkContext.validate(tc) >>> try: ... TkContext(25) ... except TypeError: ... print "Not a TkContext!" Not a TkContext! """ # Since tc is so commonly used as an implicit variable, it's worth special code here to save a lot of imports require_type(TkContext, tc, arg_name)
def test(self, frame, observation_columns=None, label_column=None):
    """
    Predict test frame labels and return metrics.

    Parameters
    ----------

    :param frame: (Frame) The frame whose labels are to be predicted
    :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be
                                predicted. By default, the same observation column names from training are used
    :param label_column: (str) Column containing the name of the label
                         By default, the same label column name from training is used
    :return: (ClassificationMetricsValue) Binary classification metrics comprised of:
             accuracy (double)
                 The proportion of predictions that are correctly identified
             confusion_matrix (dictionary)
                 A table used to describe the performance of a classification model
             f_measure (double)
                 The harmonic mean of precision and recall
             precision (double)
                 The proportion of predicted positive instances that are correctly identified
             recall (double)
                 The proportion of positive instances that are correctly identified.
    """
    require_type(Frame, frame, 'frame')
    # None falls back to the columns/label used during training.
    observations = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
    convert = self._tc.jutils.convert
    scala_result = self._scala.test(frame._scala,
                                    convert.to_scala_option_list_string(observations),
                                    convert.to_scala_option(label_column))
    return ClassificationMetricsValue(self._tc, scala_result)
def export_to_jdbc(self, connection_url, table_name, overwrite=False):
    """
    Write current frame to JDBC table

    Parameters
    ----------

    :param connection_url: (str) JDBC connection url to database server
    :param table_name: (str) JDBC table name
    :param overwrite: (Optional(bool)) Specify whether or not to overwrite the existing table, if one already
                      exists with the the same name.  If overwrite is set to False and a table with the same name
                      already exists, an exception is thrown.

    Example
    -------

    <skip>
    >>> from sparktk import TkContext
    >>> c=TkContext(sc)
    >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
    >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
    >>> my_frame = tc.frame.create(data, schema)
    <progress>
    </skip>

    connection_url : (string) : "jdbc:{datasbase_type}://{host}/{database_name}

    Sample connection string for postgres
    ex: jdbc:postgresql://localhost/postgres [standard connection string to connect to default 'postgres' database]

    table_name: (string): table name. It will create new table with given name if it does not exists already.

    <skip>
    >>> my_frame.export_to_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
    <progress>
    </skip>

    Verify exported frame in postgres

    From bash shell

    $sudo -su ppostgres psql
    postgres=#\d

    You should see demo_test table.

    Run postgres=#select * from demo_test (to verify frame).

    Notes
    -----

    java.sql.SQLException: No suitable driver found for <jdbcUrl>

    If this error is encountered while running your application, then your JDBC library cannot be found by the
    node running the application. If you're running in Local mode, make sure that you have used the --driver-class-path
    parameter. If a Spark cluster is involved, make sure that each cluster member has a copy of library, and that
    each node of the cluster has been restarted since you modified the spark-defaults.conf file.  See this
    [site](https://sparkour.urizone.net/recipes/using-jdbc/).

    Sparktk does not come with any JDBC drivers.  A driver compatible with the JDBC data sink must be supplied when
    creating the TkContext instance:

    <skip>
    >>> tc = sparktk.TkContext(pyspark_submit_args='--jars myJDBCDriver.jar')
    </skip>
    """
    # Validate python-side first so bad arguments fail fast with clear errors.
    require_type.non_empty_str(connection_url, "connection_url")
    require_type.non_empty_str(table_name, "table_name")
    require_type(bool, overwrite, "overwrite")
    # Delegate the actual export to the Scala frame implementation.
    self._scala.exportToJdbc(connection_url, table_name, overwrite)
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit):
    """
    Creates a frame by importing the data as strings from the specified csv file.  If the csv file has a header row,
    those values will be used as column names.  Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (str) A string which indicates the separation of data fields.  This is usually a single
                      character and could be a non-visible character, such as a tab. The default delimiter is a comma (,).
    :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and
                   not be included in the data.  The default value is false.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Import raw data from a csv file by specifying the path to the file, delimiter, and header option.  All data will
    be brought in the frame as strings, and columns will be named according to the header row, if there was one.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  15    Grants Pass  35076            34533            1.57%   Josephine
        [7]  16    Oregon City  34622            31859            8.67%   Clackamas
        [8]  17    McMinnville  33131            32187            2.93%   Yamhill
        [9]  18    Redmond      27427            26215            4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    # Read everything as strings (inferschema off); header handling is done by
    # the csv reader itself.
    data_frame = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
        delimiter=delimiter,
        header=str(header).lower(),
        inferschema="false").load(path, schema=None)
    # Translate each pyspark field type to a sparktk primitive type.
    schema = []
    for field in data_frame.schema.fields:
        try:
            primitive_type = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(field.dataType))
        except ValueError:
            raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(field.dataType), field.name))
        schema.append((field.name, primitive_type))
    # Convert the JVM rdd into a python-accessible RDD.
    java_rdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        data_frame._jdf.rdd())
    python_rdd = RDD(java_rdd, tc.sc)
    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, python_rdd, schema)
def export_to_tensorflow(self, path, overwrite=False):
    """
    Export frame to TensorFlow Records file on given path

    TensorFlow records are the standard data format for TensorFlow. The recommended format for TensorFlow is a TFRecords file
    containing tf.train.Example protocol buffers. The tf.train.Example protocol buffers encodes (which contain Features as a field).
    https://www.tensorflow.org/how_tos/reading_data

    During export, the API parses Spark SQL DataTypes to TensorFlow compatible DataTypes as below:

    * IntegerType or LongType =>  Int64List
    * FloatType or DoubleType => FloatList
    * ArrayType(Double) [Vector] => FloatList
    * Any other DataType (Ex: String) => BytesList

    Parameters
    ----------

    :param path: (str) HDFS/Local path to export current frame as TensorFlow records
    :param overwrite: (Optional[bool]) Specify whether or not to overwrite the existing file, if a file already
                      exists at the specified path.  If overwrite is set to False, and a file already exists, an
                      exception is thrown.

    Examples
    --------

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.sort("rank")

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  7     Bend         81236            76639            6.00%   Deschutes
        [7]  8     Medford      77677            74907            3.70%   Jackson
        [8]  9     Springfield  60177            59403            1.30%   Lane
        [9]  10    Corvallis    55298            54462            1.54%   Benton

        >>> destPath = "../tests/sandbox/output24.tfr"

        >>> import os
        >>> if os.path.exists(destPath):
        ...     os.remove(destPath)

        >>> frame.export_to_tensorflow(destPath)

    Check for output24.tfr in specified destination path either on Local or HDFS file system.

    An existing file can be overwritten by setting the overwrite parameter to True when using the
    export_to_tensorflow operation.  To demonstrate this, we will modify the frame, by removing some columns, and
    then export the frame the the same path that was previously used.  Note that if the overwrite parameter is not
    set to True, an exception would be thrown, since there is already a file at the specified path.

        >>> frame.drop_columns(["population_2010", "change"])

        >>> frame.export_to_tensorflow(destPath, overwrite=True)

    """
    # Validate python-side before delegating to the JVM.
    require_type.non_empty_str(path, "path")
    require_type(bool, overwrite, "overwrite")
    # The Scala frame implementation performs the actual export.
    self._scala.exportToTensorflow(path, overwrite)
def join_cross(self, right):
    """
    The join_cross operation performs a cross join operation on two frames, and returns a frame that contains
    the cartesian product of the two frames.  Each row from the current frame is combined with each row from
    the right frame.

    Parameters
    ----------

    :param right: (Frame) The right frame in the cross join operation.
    :returns: (Frame) A new frame with the results of the cross join.

    Notes
    -----

    The frame returned will contain all columns from the current frame and the right frame.  If a column name
    in the right frame already exists in the current frame, the column from the right frame will have a "_R"
    suffix.  The order of columns after this method is called is not guaranteed.  It is recommended that you
    rename the columns to meaningful terms prior to using the join_cross method.

    Examples
    --------

    Start by creating two test frames to use with the cross join operation:

        >>> frame = tc.frame.create([[1],[2],[3]], [("id", int)])
        >>> frame.inspect()
        [#]  id
        =======
        [0]   1
        [1]   2
        [2]   3

        >>> right = tc.frame.create([["a"],["b"],["c"]], [("char", str)])
        >>> right.inspect()
        [#]  char
        =========
        [0]  a
        [1]  b
        [2]  c

    Perform a cross join on the frame with the right frame:

        >>> result = frame.join_cross(right)
        <hide>
        >>> result.sort(["id","char"])
        </hide>
        >>> result.inspect()
        [#]  id  char
        =============
        [0]   1  a
        [1]   1  b
        [2]   1  c
        [3]   2  a
        [4]   2  b
        [5]   2  c
        [6]   3  a
        [7]   3  b
        [8]   3  c

    Note that if the right frame has a column with the same column name as the current frame, the resulting
    frame will include a "_R" suffix in the column name from the right frame.  For example, if we cross join
    the frame with itself, it will result in a frame that has two columns: 'id' and 'id_R'.

        >>> self_cross_join = frame.join_cross(frame)
        <hide>
        >>> self_cross_join.sort(["id","id_R"])
        </hide>
        >>> self_cross_join.inspect()
        [#]  id  id_R
        =============
        [0]   1     1
        [1]   1     2
        [2]   1     3
        [3]   2     1
        [4]   2     2
        [5]   2     3
        [6]   3     1
        [7]   3     2
        [8]   3     3

    """
    # Late import avoids a circular dependency between this module and frame.
    from sparktk.frame.frame import Frame
    require_type(Frame, right, "right")
    # The Scala layer computes the cartesian product and column renaming.
    return Frame(self._tc, self._scala.joinCross(right._scala))
def export_to_json(self, path, count=0, offset=0, overwrite=False):
    """
    Write current frame to HDFS in Json format.

    Parameters
    ----------

    :param path: (str) The HDFS folder path where the files will be created.
    :param count: (Optional[int]) The number of records you want. Default (0), or a non-positive value, is the
                  whole frame.
    :param offset: (Optional[int]) The number of rows to skip before exporting to the file. Default is zero (0).
    :param overwrite: (Optional[bool]) Specify whether or not to overwrite the existing file, if one already
                      exists at the specified path.  If overwrite is set to False and the file already exists,
                      an exception is thrown.

    Example
    -------

    Start out by creating a frame and then exporting it to a json file.

        <hide>
        >>> from setup import get_sandbox_path
        >>> file_path = get_sandbox_path("export_example.json")

        </hide>

        >>> frame = tc.frame.create([[1, 2, 3], [4, 5, 6]])
        >>> frame.inspect()
        [#]  C0  C1  C2
        ===============
        [0]   1   2   3
        [1]   4   5   6

        >>> frame.export_to_json(file_path)

    Import the data from the json file that we just created, and then inspect the data in the frame.

        >>> import json

        >>> # function used for parsing json rows
        >>> def parse_json(row):
        ...     record = json.loads(row.records)
        ...     columns = record.values()
        ...     columns.reverse()
        ...     return columns

        >>> frame2 = tc.frame.import_json(file_path)
        <hide>
        >>> frame2.sort("records")

        </hide>
        >>> frame2.inspect()
        [#]  records
        =================================
        [0]  {"C0":"1","C1":"2","C2":"3"}
        [1]  {"C0":"4","C1":"5","C2":"6"}

    Map columns and parse json into columns:

        >>> frame2 = frame2.map_columns(parse_json, [('C0', int), ('C1', int), ('C2', int)])
        <hide>
        >>> frame2.sort("C0")

        </hide>
        >>> frame2.inspect()
        [#]  C0  C1  C2
        ===============
        [0]   1   2   3
        [1]   4   5   6

    We can also modify the data in the original frame, and then export to the json file again, using the
    'overwrite' parameter to specify that we want to overwrite the existing file with the new data.

        >>> frame.add_columns(lambda row: row.C2 * 2, ("C3", int))
        <hide>
        >>> frame.sort("C0")

        </hide>
        >>> frame.inspect()
        [#]  C0  C1  C2  C3
        ===================
        [0]   1   2   3   6
        [1]   4   5   6  12

        >>> frame.export_to_json(file_path, overwrite=True)

    Again, import the data from the json file, and inspect the data in the frame.

        >>> frame3 = tc.frame.import_json(file_path)
        <hide>
        >>> frame3.sort("records")

        </hide>
        >>> frame3.inspect()
        [#]  records
        ===========================================
        [0]  {"C0":"1","C1":"2","C2":"3","C3":"6"}
        [1]  {"C0":"4","C1":"5","C2":"6","C3":"12"}

        >>> frame3 = frame3.map_columns(parse_json, [('C0', int), ('C1', int), ('C2', int), ('C3', int)])
        <hide>
        >>> frame3.sort("C0")

        </hide>
        >>> frame3.inspect()
        [#]  C0  C1  C2  C3
        ===================
        [0]   1   2   3   6
        [1]   4   5   6  12

    """
    # Validate python-side before delegating to the JVM.
    require_type.non_empty_str(path, "path")
    require_type(int, count, "count")
    require_type(int, offset, "offset")
    require_type(bool, overwrite, "overwrite")
    # The Scala frame implementation performs the actual export.
    self._scala.exportToJson(path, count, offset, overwrite)
def export_to_hbase(self, table_name, key_column_name=None, family_name="familyColumn", overwrite=False):
    """
    Write current frame to HBase table.

    Table must exist in HBase.

    Parameters
    ----------

    :param table_name: (str) The name of the HBase table that will contain the exported frame
    :param key_column_name: (Optional[str]) The name of the column to be used as row key in hbase table
    :param family_name: (Optional[str]) The family name of the HBase table that will contain the exported frame
    :param overwrite: (Optional[bool]) Specify whether or not to modify an existing HBase table, if one already
                      exists with the same name.  When the table is modified, columns with the same name will be
                      overwritten, and columns with new names will be added to the table.  If overwrite is False
                      and a table already exists with the same name, an exception is thrown.

    Example
    -------

        >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
        >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
        >>> my_frame = tc.frame.create(data, schema)

        <skip>
        >>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family")
        <progress>
        </skip>

    Verify exported frame in hbase

    From bash shell

        $hbase shell
        hbase(main):001:0> list

    You should see test_demo_hbase table.

    Run hbase(main):001:0> scan 'test_demo_hbase' (to verify frame).

    Output:

        ROW     COLUMN+CELL
        0       column=test_family:a, timestamp=1464219662295, value=1
        0       column=test_family:b, timestamp=1464219662295, value=0.2
        0       column=test_family:c, timestamp=1464219662295, value=-2
        0       column=test_family:d, timestamp=1464219662295, value=5
        1       column=test_family:a, timestamp=1464219662295, value=2
        1       column=test_family:b, timestamp=1464219662295, value=0.4
        1       column=test_family:c, timestamp=1464219662295, value=-1
        1       column=test_family:d, timestamp=1464219662295, value=6
        2       column=test_family:a, timestamp=1464219662295, value=3
        2       column=test_family:b, timestamp=1464219662295, value=0.6
        2       column=test_family:c, timestamp=1464219662295, value=0
        2       column=test_family:d, timestamp=1464219662295, value=7
        3       column=test_family:a, timestamp=1464219662295, value=4
        3       column=test_family:b, timestamp=1464219662295, value=0.8
        3       column=test_family:c, timestamp=1464219662295, value=1
        3       column=test_family:d, timestamp=1464219662295, value=8
        4 row(s) in 0.1560 seconds

    An existing HBase table can also be modified using the 'overwrite' parameter.  To demonstrate this, we will
    modify the frame to add a column 'e', then export the data to HBase with the same table name, and set the
    overwrite parameter to True.

        >>> my_frame.add_columns(lambda row: row.d * 10, ("e",int))

        >>> my_frame.inspect()
        [#]  a  b    c   d  e
        ======================
        [0]  1  0.2  -2  5  50
        [1]  2  0.4  -1  6  60
        [2]  3  0.6   0  7  70
        [3]  4  0.8   1  8  80

        <skip>
        >>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family", overwrite=True)
        </skip>

    Run hbase(main):001:0> scan 'test_demo_hbase' (to verify updated data).

    Output:

        ROW     COLUMN+CELL
        0       column=test_family:a, timestamp=1486680202927, value=1
        0       column=test_family:b, timestamp=1486680202927, value=0.2
        0       column=test_family:c, timestamp=1486680202927, value=-2
        0       column=test_family:d, timestamp=1486680202927, value=5
        0       column=test_family:e, timestamp=1486680202927, value=50
        1       column=test_family:a, timestamp=1486680202928, value=2
        1       column=test_family:b, timestamp=1486680202928, value=0.4
        1       column=test_family:c, timestamp=1486680202928, value=-1
        1       column=test_family:d, timestamp=1486680202928, value=6
        1       column=test_family:e, timestamp=1486680202928, value=60
        2       column=test_family:a, timestamp=1486680202927, value=3
        2       column=test_family:b, timestamp=1486680202927, value=0.6
        2       column=test_family:c, timestamp=1486680202927, value=0
        2       column=test_family:d, timestamp=1486680202927, value=7
        2       column=test_family:e, timestamp=1486680202927, value=70
        3       column=test_family:a, timestamp=1486680202928, value=4
        3       column=test_family:b, timestamp=1486680202928, value=0.8
        3       column=test_family:c, timestamp=1486680202928, value=1
        3       column=test_family:d, timestamp=1486680202928, value=8
        3       column=test_family:e, timestamp=1486680202928, value=80
        4 row(s) in 0.0440 seconds

    """
    # NOTE(review): these manual isinstance checks raise ValueError, whereas
    # require_type elsewhere in this file raises TypeError for bad types --
    # left as-is because callers may be catching ValueError here.
    if not isinstance(table_name, basestring):
        raise ValueError(
            "Unsupported 'table_name' parameter type. Expected string, but found %s." % type(table_name))
    if not isinstance(family_name, basestring):
        raise ValueError(
            "Unsupported 'family_name' parameter type. Expected string, but found %s." % type(family_name))
    require_type(bool, overwrite, "overwrite")
    # key_column_name may be None; it is wrapped as a Scala Option either way.
    self._scala.exportToHbase(
        table_name,
        self._tc.jutils.convert.to_scala_option(key_column_name),
        family_name,
        overwrite)
def test_basic(self):
    # Matching type/value pairs must pass validation without raising.
    for expected_type, value in ((int, 1), (str, "1"), (list, [1, 2, 3])):
        require_type(expected_type, value, "a")
def train(frame,
          observation_columns,
          label_column,
          num_trees=1,
          impurity="variance",
          max_depth=4,
          max_bins=100,
          min_instances_per_node=1,
          sub_sampling_rate=1.0,
          feature_subset_category="auto",
          seed=None,
          categorical_features_info=None):
    """
    Creates a Random Forest Regressor Model by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) frame of training data
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param label_column: (str) Column name containing the label for each observation
    :param num_trees: (int) Number of trees in the random forest. Default is 1
    :param impurity: (str) Criterion used for information gain calculation. Default value is "variance".
    :param max_depth: (int) Maximum depth of the tree. Default is 4
    :param max_bins: (int) Maximum number of bins used for splitting features.
    :param min_instances_per_node: (int) Minimum number of records each child node must have after a split.
    :param sub_sampling_rate: (double) Fraction between 0..1 of the training data used for learning each
                              decision tree.
    :param feature_subset_category: (str) Subset of observation columns, i.e., features, to consider when
                                    looking for the best split. Supported values "auto", "all", "sqrt",
                                    "log2", "onethird". If "auto" is set, this is based on num_trees:
                                    if num_trees == 1, set to "all"; if num_trees > 1, set to "sqrt".
    :param seed: (Optional(int)) Random seed for bootstrapping and choosing feature subsets.
                 Default is a randomly chosen seed.
    :param categorical_features_info: (Optional(Dict(str:int))) Arity of categorical features. Entry (name-> k)
                                      indicates that feature 'name' is categorical with 'k' categories indexed
                                      from 0:{0,1,...,k-1}

    :return: (RandomForestRegressorModel) The trained random forest regressor model

    Notes
    -----
    Random Forest is a supervised ensemble learning algorithm used to perform regression.

    A Random Forest Regressor model is initialized, trained on columns of a frame, and used to predict the
    value of each observation in the frame. This model runs the Spark ML implementation of Random Forest.
    During training, the decision trees are trained in parallel. During prediction, the average over-all
    tree's predicted value is the predicted value of the random forest.
    """
    import random  # local import: only needed for default-seed generation

    # Validate all user-supplied arguments before touching the JVM.
    require_type(Frame, frame, 'frame')
    column_list = affirm_type.list_of_str(observation_columns, "observation_columns")
    require_type.non_empty_str(label_column, "label_column")
    require_type.non_negative_int(num_trees, "num_trees")
    require_type.non_empty_str(impurity, "impurity")
    require_type.non_negative_int(max_depth, "max_depth")
    require_type.non_negative_int(max_bins, "max_bins")
    require_type.non_negative_int(min_instances_per_node, "min_instances_per_node")
    require_type(float, sub_sampling_rate, "sub_sampling_rate")
    if sub_sampling_rate > 1 or sub_sampling_rate < 0:
        raise ValueError("'sub_sampling_rate' parameter must have a value between 0 and 1")
    require_type.non_empty_str(feature_subset_category, "feature_subset_category")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    # FIX: the previous default seed, int(os.urandom(2).encode('hex'), 16), drew from only
    # 2 bytes (65,536 possible seeds) and used the Python-2-only str.encode('hex').
    # Draw a full 31-bit seed instead; any user-provided seed is passed through unchanged.
    seed = random.randint(0, (1 << 31) - 1) if seed is None else seed
    scala_model = _scala_obj.train(frame._scala,
                                   tc.jutils.convert.to_scala_list_string(column_list),
                                   label_column,
                                   num_trees,
                                   impurity,
                                   max_depth,
                                   max_bins,
                                   min_instances_per_node,
                                   sub_sampling_rate,
                                   feature_subset_category,
                                   seed,
                                   __get_categorical_features_info(tc, categorical_features_info))
    return RandomForestRegressorModel(tc, scala_model)
def cross_validate(frame, train_descriptors, num_folds=3, verbose=False, tc=TkContext.implicit):
    """
    Computes k-fold cross validation on classification and regression models with the given frame
    and parameter values.

    :param frame: The frame to perform cross-validation on
    :param train_descriptors: Tuple of model and Dictionary of model parameters and their value/values
                              as singleton values or a list of type grid_values
    :param num_folds: Number of folds to run the cross-validator on
    :param verbose: Flag indicating if the results of each fold are to be viewed. Default is set to False
    :param tc: spark-tk context (provided implicitly)
    :return: Summary of model's performance consisting of metrics of each combination of
             train_descriptor values per fold and averages across all folds

    See `tc.models.grid_search` for the shape of the per-fold results; each entry of the returned
    summary is a GridPoint pairing a train descriptor with its computed metrics.
    """
    TkContext.validate(tc)
    arguments.require_type(Frame, frame, "frame")

    per_fold_results = []   # one GridSearchResults per fold
    running_totals = None   # first fold's results, mutated to accumulate metric sums

    for training_split, testing_split in split_data(frame, num_folds, tc):
        fold_scores = grid_search(training_split, testing_split, train_descriptors, tc=tc)
        if running_totals is None:
            running_totals = fold_scores
        else:
            running_totals._accumulate_matching_points(fold_scores.grid_points)
        per_fold_results.append(fold_scores)

    # Convert the accumulated metric sums into per-fold averages.
    running_totals._divide_metrics(num_folds)
    return CrossValidationResults(per_fold_results, running_totals.copy(), verbose)
def cross_validate(frame, train_descriptors, num_folds=3, verbose=False, tc=TkContext.implicit):
    """
    Computes k-fold cross validation on model with the given frame and parameter values.

    :param frame: The frame to perform cross-validation on
    :param train_descriptors: Tuple of model and Dictionary of model parameters and their value/values
                              as singleton values or a list of type grid_values
    :param num_folds: Number of folds to run the cross-validator on
    :param verbose: Flag indicating if the results of each fold are to be viewed. Default is set to False
    :param tc: spark-tk context (provided implicitly)
    :return: Summary of model's performance consisting of metrics of each combination of
             train_descriptor values per fold and averages across all folds

    See `tc.models.grid_search` for the shape of the per-fold results; each entry of the returned
    summary is a GridPoint pairing a train descriptor with its computed metrics.
    """
    TkContext.validate(tc)
    arguments.require_type(Frame, frame, "frame")

    per_fold_results = []   # one GridSearchResults per fold
    running_totals = None   # first fold's results, mutated to accumulate metric sums

    for training_split, testing_split in split_data(frame, num_folds, tc):
        fold_scores = grid_search(training_split, testing_split, train_descriptors, tc)
        if running_totals is None:
            running_totals = fold_scores
        else:
            running_totals._accumulate_matching_points(fold_scores.grid_points)
        per_fold_results.append(fold_scores)

    # Convert the accumulated metric sums into per-fold averages.
    running_totals._divide_metrics(num_folds)
    return CrossValidateClassificationResults(per_fold_results, running_totals.copy(), verbose)
def grid_search(train_frame, test_frame, train_descriptors, tc=TkContext.implicit):
    """
    Implements grid search by training the specified model on all combinations of descriptor
    and testing on test frame.

    :param train_frame: The frame to train the model on
    :param test_frame: The frame to test the model on
    :param train_descriptors: Tuple of model and Dictionary of model parameters and their value/values
                              as singleton values or a list of type grid_values
    :param tc: spark-tk context passed implicitly
    :return: Summary of metrics for different combinations of the grid and the best performing
             parameter combination

    Each item in train_descriptors is either a TrainDescriptor or a tuple of
    (model module, train kwargs), where any kwarg value may be a `grid_values(...)` list that is
    expanded into one train/test run per value. The returned GridSearchResults exposes
    `grid_points` (one GridPoint of descriptor + metrics per combination) and `find_best()`.

    Example
    -------
        from sparktk.models import grid_values
        grid_result = tc.models.grid_search(train_frame, test_frame,
                                            [(tc.models.classification.svm,
                                              {"observation_columns": "data",
                                               "label_column": "label",
                                               "num_iterations": grid_values(2, 10),
                                               "step_size": 0.01})])
        grid_result.find_best()
    """
    # validate input
    TkContext.validate(tc)
    descriptors = affirm_type.list_of_anything(train_descriptors, "train_descriptors")
    for i in xrange(len(descriptors)):
        item = descriptors[i]
        if not isinstance(item, TrainDescriptor):
            # Anything that isn't already a TrainDescriptor must be a (model, kwargs) tuple.
            require_type(tuple, item, "item",
                         "grid_search needs a list of items which are either of type TrainDescriptor "
                         "or tuples of (model, train_kwargs)")
            if len(item) != 2:
                raise value_error("list requires tuples of len 2", item, "item in train_descriptors")
            if not hasattr(item[0], 'train'):
                raise value_error("first item in tuple needs to be a object with a 'train' function",
                                  item, "item in train_descriptors")
            descriptors[i] = TrainDescriptor(item[0], item[1])

    # FIX: both frames were previously validated under the same label "frame", which produced
    # misleading error messages; label each with its actual parameter name.
    arguments.require_type(Frame, train_frame, "train_frame")
    arguments.require_type(Frame, test_frame, "test_frame")

    grid_points = []
    for descriptor in descriptors:
        train_method = getattr(descriptor.model_type, "train")
        # Expand any grid_values(...) entries into one kwargs dict per combination.
        list_of_kwargs = expand_kwarg_grids([descriptor.kwargs])
        for kwargs in list_of_kwargs:
            train_kwargs = dict(kwargs)
            train_kwargs['frame'] = train_frame
            validate_call(train_method, train_kwargs, ignore_self=True)
            model = descriptor.model_type.train(**train_kwargs)
            test_kwargs = dict(kwargs)
            test_kwargs['frame'] = test_frame
            # Keep only the kwargs that model.test actually accepts.
            test_kwargs = extract_call(model.test, test_kwargs, ignore_self=True)
            metrics = model.test(**test_kwargs)
            grid_points.append(GridPoint(descriptor=TrainDescriptor(descriptor.model_type, train_kwargs),
                                         metrics=metrics))
    return GridSearchResults(grid_points)
def import_csv(path, delimiter=",", header=False, schema=None,
               datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields. This
                      is usually a single character and could be a non-visible character, such as a
                      tab. The default delimiter is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be
                   used to name columns (unless a schema is provided), and not be included in the
                   data. The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for
                   specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings. Column data types will be inferred, based on
      the data. The column names specified will override column names that are found in the header
      row.
    * None, where the schema is automatically inferred based on the data. Columns are named based
      on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using
                            the java.text.SimpleDateFormat specified at
                            https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html

    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------
        frame = tc.frame.import_csv("../datasets/cities.csv", "|", header=True)

    Passing a list of column names as `schema` overrides the header names while still inferring
    data types; passing a list of (name, type) tuples fixes the schema completely and skips
    inference.
    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        fields = []
        for column in schema:
            # FIX: replaced deprecated dict.has_key() with the `in` operator.
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
        delimiter=delimiter,
        header=header_str,
        dateformat=datetime_format,
        inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        # Build the sparktk schema from the inferred pyspark types, preferring any
        # user-supplied column names over the names found in the file.
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            # FIX: the original message was built from two adjacent string literals with no
            # separating space, producing "...must match thenumber of columns...".
            raise ValueError("Bad schema value. The number of columns in the custom schema ({0}) must match the "
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects. Convert them to long (ms since epoch)
        for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
def train(frame, source_column_name, dest_column_name, weight_column_name, max_steps=10, regularization=0.5,
          alpha=0.5, num_factors=3, use_implicit=False, num_user_blocks=2, num_item_blocks=3,
          checkpoint_iterations=10, target_rmse=0.05):
    """
    Train a collaborative filtering model on the data in the given frame.

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 .. 1
    :param alpha: (double) value between 0 .. 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame  # imported late to avoid a circular dependency

    def _require_unit_interval(value, name):
        # Float that must fall within the inclusive range [0, 1].
        require_type(float, value, name)
        if value < 0 or value > 1:
            raise ValueError("'%s' parameter must have a value between 0 and 1" % name)

    # Validate arguments in declaration order so the first bad one raises.
    require_type(Frame, frame, 'frame')
    for value, name in ((source_column_name, "source_column_name"),
                        (dest_column_name, "dest_column_name"),
                        (weight_column_name, "weight_column_name")):
        require_type.non_empty_str(value, name)
    require_type.non_negative_int(max_steps, "max_steps")
    _require_unit_interval(regularization, "regularization")
    _require_unit_interval(alpha, "alpha")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations, "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")

    # Delegate the actual training to the Scala implementation.
    tc = frame._tc
    scala_model = get_scala_obj(tc).train(frame._scala, source_column_name, dest_column_name,
                                          weight_column_name, max_steps, regularization, alpha,
                                          num_factors, use_implicit, num_user_blocks, num_item_blocks,
                                          checkpoint_iterations, target_rmse)
    return CollaborativeFilteringModel(tc, scala_model)
def train(frame, source_column_name, dest_column_name, weight_column_name, max_steps=10, regularization=0.5,
          alpha=0.5, num_factors=3, use_implicit=False, num_user_blocks=2, num_item_blocks=3,
          checkpoint_iterations=10, target_rmse=0.05):
    """
    Create a collaborative filtering model by training on the given frame.

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 .. 1
    :param alpha: (double) value between 0 .. 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame  # late import; Frame imports this module too

    # Argument validation -- keep the order stable so callers see the same
    # exception for the same first invalid argument.
    require_type(Frame, frame, 'frame')
    require_type.non_empty_str(source_column_name, "source_column_name")
    require_type.non_empty_str(dest_column_name, "dest_column_name")
    require_type.non_empty_str(weight_column_name, "weight_column_name")
    require_type.non_negative_int(max_steps, "max_steps")
    require_type(float, regularization, "regularization")
    if regularization > 1 or regularization < 0:
        raise ValueError("'regularization' parameter must have a value between 0 and 1")
    require_type(float, alpha, "alpha")
    if alpha > 1 or alpha < 0:
        raise ValueError("'alpha' parameter must have a value between 0 and 1")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations, "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")

    tc = frame._tc
    train_args = (frame._scala, source_column_name, dest_column_name, weight_column_name,
                  max_steps, regularization, alpha, num_factors, use_implicit,
                  num_user_blocks, num_item_blocks, checkpoint_iterations, target_rmse)
    scala_model = get_scala_obj(tc).train(*train_args)
    return CollaborativeFilteringModel(tc, scala_model)
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode),("b", unicode),("c",unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> frame.inspect()
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            # if a custom schema is provided, don't waste time inferring the schema during load
            infer_schema = False
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        # Translate the sparktk schema into a pyspark StructType for the loader.
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        # Build the frame schema from the inferred pyspark schema, applying any
        # custom column names that were provided.
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)

        if (df_column_count != custom_column_count):
            # note: trailing space inside the first literal keeps the joined message readable
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the "
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must map them to ms-since-epoch longs.
        rdd = df.rdd.map(cast_datetime)
    else:
        # Otherwise convert the underlying Scala RDD directly; doing this only in
        # this branch avoids a wasted JVM round-trip when the datetime map is used.
        jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
        rdd = RDD(jrdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit):
    """
    Creates a frame by importing the data as strings from the specified csv file.  If the csv file has a header row,
    those values will be used as column names.  Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (str) A string which indicates the separation of data fields.  This is usually a single
                      character and could be a non-visible character, such as a tab. The default delimiter is a comma (,).
    :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not
                   be included in the data.  The default value is false.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------
    Import raw data from a csv file by specifying the path to the file, delimiter, and header option.  All data will
    be brought in the frame as strings, and columns will be named according to the header row, if there was one.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  15    Grants Pass  35076            34533            1.57%   Josephine
        [7]  16    Oregon City  34622            31859            8.67%   Clackamas
        [8]  17    McMinnville  33131            32187            2.93%   Yamhill
        [9]  18    Redmond      27427            26215            4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")

    # Load through the sparktk csv reader with schema inference disabled, so
    # every column arrives as a plain string.
    reader = tc.sql_context.read.format("com.databricks.spark.csv.org.trustedanalytics.sparktk")
    df = reader.options(delimiter=delimiter,
                        header=str(header).lower(),
                        inferschema="false").load(path, schema=None)

    # Map each pyspark field to a (name, primitive type) tuple for the frame schema.
    df_schema = []
    for field in df.schema.fields:
        try:
            primitive_type = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(field.dataType))
        except ValueError:
            raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(field.dataType), field.name))
        df_schema.append((field.name, primitive_type))

    # Convert the underlying Scala RDD into a Python-usable RDD.
    scala_rdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    python_rdd = RDD(scala_rdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, python_rdd, df_schema)