Пример #1
0
    def jsonRDD(self, rdd):
        """Builds a L{SchemaRDD} from an RDD that stores one JSON object per string.

        No schema is supplied by the caller: the entire dataset is scanned
        once to determine it.

        Elements of C{rdd} that are not already strings are converted with
        C{unicode()}, and every element is encoded as UTF-8 before it is
        handed to the JVM side.

        >>> srdd = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd, "table1")
        >>> srdd2 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1")
        >>> srdd2.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        """
        def to_utf8(_split, records):
            # The first argument (the split) is not used; only the records
            # of the split are transformed.
            for record in records:
                text = record if isinstance(record, basestring) else unicode(record)
                yield text.encode("utf-8")

        encoded = PipelinedRDD(rdd, to_utf8)
        # Must be set before _jrdd is read below. Presumably this makes the
        # yielded byte strings reach the JVM without Python-side
        # serialization -- confirm against PipelinedRDD.
        encoded._bypass_serializer = True
        # Turn the UTF-8 byte arrays back into strings on the JVM side.
        java_strings = encoded._jrdd.map(self._jvm.BytesToString())
        return SchemaRDD(self._ssql_ctx.jsonRDD(java_strings.rdd()), self)
Пример #2
0
    def jsonRDD(self, rdd, schema=None):
        """Loads an RDD storing one JSON object per string as a L{SchemaRDD}.

        If the schema is provided, applies the given schema to this JSON dataset.
        Otherwise, it goes through the entire dataset once to determine the schema.

        Elements of C{rdd} that are not already strings are converted with
        C{unicode()}, and every element is encoded as UTF-8 before it is
        handed to the JVM side. When C{schema} is given, its C{repr()} string
        is what gets passed to the JVM-side C{parseDataType}.

        >>> srdd1 = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd1, "table1")
        >>> srdd2 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1")
        >>> srdd2.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema())
        >>> sqlCtx.registerRDDAsTable(srdd3, "table2")
        >>> srdd4 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table2")
        >>> srdd4.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> schema = StructType([
        ...     StructField("field2", StringType(), True),
        ...     StructField("field3",
        ...         StructType([
        ...             StructField("field5", ArrayType(IntegerType(), False), True)]), False)])
        >>> srdd5 = sqlCtx.jsonRDD(json, schema)
        >>> sqlCtx.registerRDDAsTable(srdd5, "table3")
        >>> srdd6 = sqlCtx.sql(
        ...   "SELECT field2 AS f1, field3.field5 as f2, field3.field5[0] as f3 from table3")
        >>> srdd6.collect() == [
        ... {"f1": "row1", "f2": None, "f3": None},
        ... {"f1": None, "f2": [10, 11], "f3": 10},
        ... {"f1": "row3", "f2": [], "f3": None}]
        True
        """
        def to_utf8(_split, records):
            # The first argument (the split) is not used; only the records
            # of the split are transformed.
            for record in records:
                text = record if isinstance(record, basestring) else unicode(record)
                yield text.encode("utf-8")

        encoded = PipelinedRDD(rdd, to_utf8)
        # Must be set before _jrdd is read below. Presumably this makes the
        # yielded byte strings reach the JVM without Python-side
        # serialization -- confirm against PipelinedRDD.
        encoded._bypass_serializer = True
        # Turn the UTF-8 byte arrays back into strings on the JVM side.
        java_strings = encoded._jrdd.map(self._jvm.BytesToString())

        if schema is None:
            # No schema given: the JVM side works it out from the data.
            return SchemaRDD(self._ssql_ctx.jsonRDD(java_strings.rdd()), self)

        # The schema crosses to the JVM as its repr() string; use the
        # built-in rather than calling the dunder method directly.
        scala_datatype = self._ssql_ctx.parseDataType(repr(schema))
        return SchemaRDD(
            self._ssql_ctx.jsonRDD(java_strings.rdd(), scala_datatype), self)
Пример #3
0
    def jsonRDD(self, rdd, schema=None):
        """Loads an RDD storing one JSON object per string as a L{SchemaRDD}.

        If the schema is provided, applies the given schema to this JSON dataset.
        Otherwise, it goes through the entire dataset once to determine the schema.

        Before the data is handed to the JVM side, every element of C{rdd} is
        encoded as UTF-8; elements that are not strings are first converted
        with C{unicode()}. A provided C{schema} is sent to the JVM-side
        C{parseDataType} in the form of its C{repr()} string.

        >>> srdd1 = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd1, "table1")
        >>> srdd2 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1")
        >>> srdd2.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema())
        >>> sqlCtx.registerRDDAsTable(srdd3, "table2")
        >>> srdd4 = sqlCtx.sql(
        ...   "SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table2")
        >>> srdd4.collect() == [
        ... {"f1":1, "f2":"row1", "f3":{"field4":11, "field5": None}, "f4":None},
        ... {"f1":2, "f2":None, "f3":{"field4":22,  "field5": [10, 11]}, "f4":[{"field7": "row2"}]},
        ... {"f1":None, "f2":"row3", "f3":{"field4":33, "field5": []}, "f4":None}]
        True
        >>> schema = StructType([
        ...     StructField("field2", StringType(), True),
        ...     StructField("field3",
        ...         StructType([
        ...             StructField("field5", ArrayType(IntegerType(), False), True)]), False)])
        >>> srdd5 = sqlCtx.jsonRDD(json, schema)
        >>> sqlCtx.registerRDDAsTable(srdd5, "table3")
        >>> srdd6 = sqlCtx.sql(
        ...   "SELECT field2 AS f1, field3.field5 as f2, field3.field5[0] as f3 from table3")
        >>> srdd6.collect() == [
        ... {"f1": "row1", "f2": None, "f3": None},
        ... {"f1": None, "f2": [10, 11], "f3": 10},
        ... {"f1": "row3", "f2": [], "f3": None}]
        True
        """
        def encode_rows(_split, rows):
            # Only the rows are used; the split argument is ignored.
            for row in rows:
                if isinstance(row, basestring):
                    yield row.encode("utf-8")
                else:
                    yield unicode(row).encode("utf-8")

        byte_rdd = PipelinedRDD(rdd, encode_rows)
        # Set ahead of the _jrdd access on the next statement. Presumably
        # this lets the raw UTF-8 bytes through without Python-side
        # serialization -- confirm against PipelinedRDD.
        byte_rdd._bypass_serializer = True
        # JVM-side conversion of each byte array back into a string.
        string_jrdd = byte_rdd._jrdd.map(self._jvm.BytesToString())

        if schema is not None:
            # repr() is the idiomatic spelling of schema.__repr__(); the
            # resulting string is what parseDataType receives.
            scala_datatype = self._ssql_ctx.parseDataType(repr(schema))
            jschema_rdd = self._ssql_ctx.jsonRDD(string_jrdd.rdd(), scala_datatype)
        else:
            # Without a schema the JVM side determines it from the data.
            jschema_rdd = self._ssql_ctx.jsonRDD(string_jrdd.rdd())
        return SchemaRDD(jschema_rdd, self)
Пример #4
0
    def jsonRDD(self, rdd):
        """Turns an RDD holding one JSON object per string into a L{SchemaRDD}.

        The schema is determined by going through the entire dataset once.

        Every element of C{rdd} is encoded as UTF-8 before it is handed to
        the JVM side; elements that are not strings are first converted with
        C{unicode()}.

        >>> srdd = sqlCtx.jsonRDD(json)
        >>> sqlCtx.registerRDDAsTable(srdd, "table1")
        >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1")
        >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}},
        ...                     {"f1": 2, "f2": "row2", "f3":{"field4":22}},
        ...                     {"f1": 3, "f2": "row3", "f3":{"field4":33}}]
        True
        """
        def encode_rows(_split, rows):
            # Only the rows are used; the split argument is ignored.
            for row in rows:
                if isinstance(row, basestring):
                    yield row.encode("utf-8")
                else:
                    yield unicode(row).encode("utf-8")

        byte_rdd = PipelinedRDD(rdd, encode_rows)
        # Set ahead of the _jrdd access on the next statement. Presumably
        # this lets the raw UTF-8 bytes through without Python-side
        # serialization -- confirm against PipelinedRDD.
        byte_rdd._bypass_serializer = True
        # JVM-side conversion of each byte array back into a string.
        string_jrdd = byte_rdd._jrdd.map(self._jvm.BytesToString())
        inferred = self._ssql_ctx.jsonRDD(string_jrdd.rdd())
        return SchemaRDD(inferred, self)