def jsonRDD(self, rdd): """Loads an RDD storing one JSON object per string, returning the result as a L{SchemaRDD}. It goes through the entire dataset once to determine the schema. >>> srdd = sqlCtx.jsonRDD(json) >>> sqlCtx.registerRDDAsTable(srdd, "table1") >>> srdd2 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1") >>> srdd2.collect() == [ ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None}, ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]}, ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}] True """ def func(split, iterator): for x in iterator: if not isinstance(x, basestring): x = unicode(x) yield x.encode("utf-8") keyed = PipelinedRDD(rdd, func) keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) return SchemaRDD(jschema_rdd, self)
def jsonRDD(self, rdd, schema=None): """Loads an RDD storing one JSON object per string as a L{SchemaRDD}. If the schema is provided, applies the given schema to this JSON dataset. Otherwise, it goes through the entire dataset once to determine the schema. >>> srdd1 = sqlCtx.jsonRDD(json) >>> sqlCtx.registerRDDAsTable(srdd1, "table1") >>> srdd2 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1") >>> srdd2.collect() == [ ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None}, ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]}, ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}] True >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema()) >>> sqlCtx.registerRDDAsTable(srdd3, "table2") >>> srdd4 = sqlCtx.sql( ... "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table2") >>> srdd4.collect() == [ ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None}, ... {"f1":2, "f2":None, "f3":{"field4":22, "field5": [10, 11]}, "f4":[{"field7": "row2"}]}, ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}] True >>> schema = StructType([ ... StructField("field2", StringType(), True), ... StructField("field3", ... StructType([ ... StructField("field5", ArrayType(IntegerType(), False), True)]), False)]) >>> srdd5 = sqlCtx.jsonRDD(json, schema) >>> sqlCtx.registerRDDAsTable(srdd5, "table3") >>> srdd6 = sqlCtx.sql( ... "SELECT field2 AS f1, field3.field5 as f2, field3.field5[0] as f3 from table3") >>> srdd6.collect() == [ ... {"f1": "row1", "f2": None, "f3": None}, ... {"f1": None, "f2": [10, 11], "f3": 10}, ... {"f1": "row3", "f2": [], "f3": None}] True """ def func(split, iterator): for x in iterator: if not isinstance(x, basestring): x = unicode(x) yield x.encode("utf-8") keyed = PipelinedRDD(rdd, func) keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) if schema is None: jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) else: scala_datatype = self._ssql_ctx.parseDataType(schema.__repr__()) jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) return SchemaRDD(jschema_rdd, self)
def jsonRDD(self, rdd): """Loads an RDD storing one JSON object per string, returning the result as a L{SchemaRDD}. It goes through the entire dataset once to determine the schema. >>> srdd = sqlCtx.jsonRDD(json) >>> sqlCtx.registerRDDAsTable(srdd, "table1") >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1") >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}}, ... {"f1": 2, "f2": "row2", "f3":{"field4":22}}, ... {"f1": 3, "f2": "row3", "f3":{"field4":33}}] True """ def func(split, iterator): for x in iterator: if not isinstance(x, basestring): x = unicode(x) yield x.encode("utf-8") keyed = PipelinedRDD(rdd, func) keyed._bypass_serializer = True jrdd = keyed._jrdd.map(self._jvm.BytesToString()) jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) return SchemaRDD(jschema_rdd, self)