def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None,
                         numPartitions=None, filterFunc=None):
    """
    Return a new DStream by applying incremental `reduceByKey` over a sliding window.

    The reduced value of a new window is calculated using the old window's reduced value:
     1. reduce the new values that entered the window (e.g., adding new counts)
     2. "inverse reduce" the old values that left the window (e.g., subtracting old counts)

    `invFunc` can be None; in that case all the RDDs in the window are reduced from
    scratch, which can be slower than providing `invFunc`.

    @param func: associative and commutative reduce function
    @param invFunc: inverse function of `func`
    @param windowDuration: width of the window; must be a multiple of this
                           DStream's batching interval
    @param slideDuration: sliding interval of the window (i.e., the interval after which
                          the new DStream will generate RDDs); must be a multiple of this
                          DStream's batching interval
    @param numPartitions: number of partitions of each RDD in the new DStream
    @param filterFunc: function to filter expired key-value pairs;
                       only pairs that satisfy the function are retained;
                       set this to None if you do not want to filter
    """
    self._validate_window_param(windowDuration, slideDuration)
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    reduced = self.reduceByKey(func, numPartitions)

    if invFunc:
        def reduceFunc(t, a, b):
            b = b.reduceByKey(func, numPartitions)
            r = a.union(b).reduceByKey(func, numPartitions) if a else b
            if filterFunc:
                r = r.filter(filterFunc)
            return r

        def invReduceFunc(t, a, b):
            b = b.reduceByKey(func, numPartitions)
            joined = a.leftOuterJoin(b, numPartitions)
            return joined.mapValues(lambda kv: invFunc(kv[0], kv[1])
                                    if kv[1] is not None else kv[0])

        jreduceFunc = TransformFunction(self._sc, reduceFunc, reduced._jrdd_deserializer)
        jinvReduceFunc = TransformFunction(self._sc, invReduceFunc, reduced._jrdd_deserializer)
        if slideDuration is None:
            slideDuration = self._slideDuration
        dstream = self._sc._jvm.PythonReducedWindowedDStream(
            reduced._jdstream.dstream(),
            jreduceFunc, jinvReduceFunc,
            self._ssc._jduration(windowDuration),
            self._ssc._jduration(slideDuration))
        return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
    else:
        return reduced.window(windowDuration, slideDuration).reduceByKey(func, numPartitions)
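# Usage sketch (not part of the method above): windowed word count via the
# incremental path. `ssc` and `lines` are hypothetical names; checkpointing
# must be enabled for the inverse-reduce bookkeeping to work.
ssc.checkpoint("/tmp/checkpoint")
pairs = lines.flatMap(lambda line: line.split(" ")).map(lambda w: (w, 1))
counts = pairs.reduceByKeyAndWindow(
    lambda a, b: a + b,                # add counts entering the window
    lambda a, b: a - b,                # subtract counts leaving the window
    windowDuration=30, slideDuration=10,
    filterFunc=lambda kv: kv[1] > 0)   # drop keys whose count fell to zero
counts.pprint()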
def updateStateByKey(self, updateFunc, numPartitions=None, initialRDD=None):
    """
    Return a new "state" DStream where the state for each key is updated by applying
    the given function on the previous state of the key and the new values of the key.

    @param updateFunc: State update function. If this function returns None, then the
                       corresponding state key-value pair will be eliminated.
    """
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    if initialRDD and not isinstance(initialRDD, RDD):
        initialRDD = self._sc.parallelize(initialRDD)

    def reduceFunc(t, a, b):
        if a is None:
            g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None))
        else:
            g = a.cogroup(b.partitionBy(numPartitions), numPartitions)
            g = g.mapValues(lambda ab: (list(ab[1]), list(ab[0])[0] if len(ab[0]) else None))
        state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1]))
        return state.filter(lambda k_v: k_v[1] is not None)

    jreduceFunc = TransformFunction(self._sc, reduceFunc,
                                    self._sc.serializer, self._jrdd_deserializer)
    if initialRDD:
        initialRDD = initialRDD._reserialize(self._jrdd_deserializer)
        dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc,
                                                   initialRDD._jrdd)
    else:
        dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc)
    return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
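# Usage sketch (hypothetical `ssc` and `pairs`): a running count per key,
# seeded from an initial RDD. Returning None from the update function would
# drop that key from the state.
ssc.checkpoint("/tmp/checkpoint")

def update(new_values, last_state):
    return sum(new_values) + (last_state or 0)

running_counts = pairs.updateStateByKey(update, initialRDD=[("hello", 1)])
running_counts.pprint()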
def transformWith(
    self: "DStream[T]",
    func: Union[
        Callable[[RDD[T], RDD[U]], RDD[V]],
        Callable[[datetime, RDD[T], RDD[U]], RDD[V]],
    ],
    other: "DStream[U]",
    keepSerializer: bool = False,
) -> "DStream[V]":
    """
    Return a new DStream in which each RDD is generated by applying a function
    on each RDD of this DStream and 'other' DStream.

    `func` can take two arguments (`rdd_a`, `rdd_b`) or three
    arguments (`time`, `rdd_a`, `rdd_b`).
    """
    if func.__code__.co_argcount == 2:
        oldfunc = func

        def func(_: datetime, a: RDD[T], b: RDD[U]) -> RDD[V]:
            return oldfunc(a, b)  # type: ignore[call-arg, arg-type]

    assert func.__code__.co_argcount == 3, "func should take two or three arguments"
    jfunc = TransformFunction(
        self._sc,
        func,
        self._jrdd_deserializer,
        other._jrdd_deserializer,
    )
    assert self._sc._jvm is not None
    dstream = self._sc._jvm.PythonTransformed2DStream(
        self._jdstream.dstream(), other._jdstream.dstream(), jfunc)
    jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer
    return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)
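# Usage sketch (hypothetical `stream_a` and `stream_b`): a per-batch join of
# two DStreams; the three-argument form also receives the batch time.
joined = stream_a.transformWith(lambda a, b: a.join(b), stream_b)
stamped = stream_a.transformWith(
    lambda t, a, b: a.union(b).map(lambda x: (t, x)), stream_b)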
def foreachRDD_modified(self, func):
    """
    Apply a function to each RDD in this DStream.
    """
    # Assumes module-level imports: import os, psycopg2; from datetime import datetime
    if func.__code__.co_argcount == 1:
        old_func = func
        func = lambda t, rdd: old_func(rdd)
    jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer)
    api = self._ssc._jvm.PythonDStream

    print("")
    print("############################################################")
    print("#                                                          #")
    print("#                  Opening DB Connection                   #")
    print("#                                                          #")
    print("############################################################")
    print("Spark Processing Starting: " + str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    print("")
    print("------------------------------------------------------------")
    print("")

    # Create the DB connection here: because of Spark's lazy evaluation, this
    # code runs once on the driver at the start of the job, so the connection
    # is only created once.
    global conn
    conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s" %
                            (os.environ['psqlDB'], os.environ['psqlUser'],
                             os.environ['psqlPwd'], os.environ['psql']))
    global cur
    cur = conn.cursor()

    api.callForeachRDD(self._jdstream, jfunc)
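# Design note: the variant above opens a single psycopg2 connection on the
# driver. A commonly suggested alternative, sketched below with a hypothetical
# `events` table and a hypothetical `stream` DStream, creates short-lived
# connections per partition on the executors instead, since driver-side
# handles cannot be used inside RDD actions running on workers.
import os
import psycopg2

def save_partition(records):
    # One connection per partition, created where the records are processed.
    conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s" %
                            (os.environ['psqlDB'], os.environ['psqlUser'],
                             os.environ['psqlPwd'], os.environ['psql']))
    cur = conn.cursor()
    for rec in records:
        cur.execute("INSERT INTO events (payload) VALUES (%s)", (str(rec),))
    conn.commit()
    cur.close()
    conn.close()

stream.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))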
def updateStateByKey(self, updateFunc, numPartitions=None):
    """
    Return a new "state" DStream where the state for each key is updated by applying
    the given function on the previous state of the key and the new values of the key.

    @param updateFunc: State update function. If this function returns None, then the
                       corresponding state key-value pair will be eliminated.
    """
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    def reduceFunc(t, a, b):
        if a is None:
            g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None))
        else:
            g = a.cogroup(b.partitionBy(numPartitions), numPartitions)
            # Tuple-parameter lambdas (`lambda (va, vb): ...`) are Python 2
            # only; index into the pair so this also runs on Python 3.
            g = g.mapValues(lambda ab: (list(ab[1]), list(ab[0])[0] if len(ab[0]) else None))
        state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1]))
        return state.filter(lambda k_v: k_v[1] is not None)

    jreduceFunc = TransformFunction(self._sc, reduceFunc,
                                    self._sc.serializer, self._jrdd_deserializer)
    dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc)
    return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
def _jdstream(self):
    if self._jdstream_val is not None:
        return self._jdstream_val

    jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer)
    dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc)
    self._jdstream_val = dstream.asJavaDStream()
    return self._jdstream_val
def _jdstream(self):
    if self._jdstream_val is not None:
        return self._jdstream_val

    jfunc = TransformFunction(self._sc, self.func, self.prev._jrdd_deserializer) \
        .rdd_wrapper(lambda jrdd, ctx, ser: KafkaRDD(jrdd, ctx, ser))
    dstream = self._sc._jvm.PythonTransformedDStream(self.prev._jdstream.dstream(), jfunc)
    self._jdstream_val = dstream.asJavaDStream()
    return self._jdstream_val
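# Usage sketch: because this Kafka variant wraps each materialized RDD in a
# KafkaRDD, per-batch callbacks can read Kafka offset metadata. Assumes a
# stream built with the legacy pyspark.streaming.kafka direct API, where
# KafkaRDD exposes offsetRanges(); `kafka_stream` is hypothetical.
def report_offsets(rdd):
    for o in rdd.offsetRanges():  # available thanks to the KafkaRDD wrapper
        print("%s[%d] %d -> %d" % (o.topic, o.partition, o.fromOffset, o.untilOffset))

kafka_stream.foreachRDD(report_offsets)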
def updateStateByKey(
    self: "DStream[Tuple[K, V]]",
    updateFunc: Callable[[Iterable[V], Optional[S]], S],
    numPartitions: Optional[int] = None,
    initialRDD: Optional[Union[RDD[Tuple[K, S]], Iterable[Tuple[K, S]]]] = None,
) -> "DStream[Tuple[K, S]]":
    """
    Return a new "state" DStream where the state for each key is updated by applying
    the given function on the previous state of the key and the new values of the key.

    Parameters
    ----------
    updateFunc : function
        State update function. If this function returns None, then the
        corresponding state key-value pair will be eliminated.
    """
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    if initialRDD and not isinstance(initialRDD, RDD):
        initialRDD = self._sc.parallelize(initialRDD)

    def reduceFunc(t: datetime, a: Any, b: Any) -> Any:
        if a is None:
            g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None))
        else:
            g = a.cogroup(b.partitionBy(cast(int, numPartitions)), numPartitions)
            g = g.mapValues(lambda ab: (list(ab[1]), list(ab[0])[0] if len(ab[0]) else None))
        state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1]))
        return state.filter(lambda k_v: k_v[1] is not None)

    jreduceFunc = TransformFunction(
        self._sc,
        reduceFunc,
        self._sc.serializer,
        self._jrdd_deserializer,
    )
    if initialRDD:
        initialRDD = cast(RDD[Tuple[K, S]], initialRDD)._reserialize(self._jrdd_deserializer)
        assert self._sc._jvm is not None
        dstream = self._sc._jvm.PythonStateDStream(
            self._jdstream.dstream(),
            jreduceFunc,
            initialRDD._jrdd,
        )
    else:
        assert self._sc._jvm is not None
        dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc)
    return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
def foreachRDD(self, func):
    """
    Apply a function to each RDD in this DStream.
    """
    if func.__code__.co_argcount == 1:
        old_func = func
        func = lambda t, rdd: old_func(rdd)
    jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer)
    api = self._ssc._jvm.PythonDStream
    api.callForeachRDD(self._jdstream, jfunc)
def transform(self, dstreams, transformFunc):
    """
    Create a new DStream in which each RDD is generated by applying
    a function on RDDs of the DStreams. The order of the JavaRDDs in the
    transform function parameter will be the same as the order of
    corresponding DStreams in the list.
    """
    jdstreams = [d._jdstream for d in dstreams]
    # change the final serializer to sc.serializer
    func = TransformFunction(self._sc,
                             lambda t, *rdds: transformFunc(rdds),
                             *[d._jrdd_deserializer for d in dstreams])
    jfunc = self._jvm.TransformFunction(func)
    jdstream = self._jssc.transform(jdstreams, jfunc)
    return DStream(jdstream, self, self._sc.serializer)
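# Usage sketch (hypothetical `ssc`, `stream_a`, `stream_b`): the transform
# function receives one tuple of RDDs, ordered like the DStream list above.
def combine(rdds):
    rdd_a, rdd_b = rdds
    return rdd_a.union(rdd_b)

combined = ssc.transform([stream_a, stream_b], combine)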
def transformWith(self, func, other, keepSerializer=False):
    """
    Return a new DStream in which each RDD is generated by applying a function
    on each RDD of this DStream and 'other' DStream.

    `func` can take two arguments (`rdd_a`, `rdd_b`) or three
    arguments (`time`, `rdd_a`, `rdd_b`).
    """
    if func.__code__.co_argcount == 2:
        oldfunc = func
        func = lambda t, a, b: oldfunc(a, b)
    assert func.__code__.co_argcount == 3, "func should take two or three arguments"
    jfunc = TransformFunction(self._sc, func,
                              self._jrdd_deserializer, other._jrdd_deserializer)
    dstream = self._sc._jvm.PythonTransformed2DStream(self._jdstream.dstream(),
                                                      other._jdstream.dstream(), jfunc)
    jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer
    return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)
def foreachRDD(
    self: "DStream[T]",
    func: Union[Callable[[RDD[T]], None], Callable[[datetime, RDD[T]], None]],
) -> None:
    """
    Apply a function to each RDD in this DStream.
    """
    if func.__code__.co_argcount == 1:
        old_func = func

        def func(_: datetime, rdd: "RDD[T]") -> None:
            return old_func(rdd)  # type: ignore[call-arg, arg-type]

    jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer)
    assert self._ssc._jvm is not None
    api = self._ssc._jvm.PythonDStream
    api.callForeachRDD(self._jdstream, jfunc)
def transform(
    self, dstreams: List["DStream[Any]"], transformFunc: Callable[..., RDD[T]]
) -> "DStream[T]":
    """
    Create a new DStream in which each RDD is generated by applying
    a function on RDDs of the DStreams. The order of the JavaRDDs in the
    transform function parameter will be the same as the order of
    corresponding DStreams in the list.
    """
    jdstreams = [d._jdstream for d in dstreams]  # type: ignore[attr-defined]
    # change the final serializer to sc.serializer
    func = TransformFunction(
        self._sc,
        lambda t, *rdds: transformFunc(rdds),
        *[d._jrdd_deserializer for d in dstreams],  # type: ignore[attr-defined]
    )
    assert self._jvm is not None
    jfunc = self._jvm.TransformFunction(func)
    jdstream = self._jssc.transform(jdstreams, jfunc)
    return DStream(jdstream, self, self._sc.serializer)
def reduceByKeyAndWindow(
    self: "DStream[Tuple[K, V]]",
    func: Callable[[V, V], V],
    invFunc: Optional[Callable[[V, V], V]],
    windowDuration: int,
    slideDuration: Optional[int] = None,
    numPartitions: Optional[int] = None,
    filterFunc: Optional[Callable[[Tuple[K, V]], bool]] = None,
) -> "DStream[Tuple[K, V]]":
    """
    Return a new DStream by applying incremental `reduceByKey` over a sliding window.

    The reduced value of a new window is calculated using the old window's reduced value:
     1. reduce the new values that entered the window (e.g., adding new counts)
     2. "inverse reduce" the old values that left the window (e.g., subtracting old counts)

    `invFunc` can be None; in that case all the RDDs in the window are reduced from
    scratch, which can be slower than providing `invFunc`.

    Parameters
    ----------
    func : function
        associative and commutative reduce function
    invFunc : function
        inverse function of `func`
    windowDuration : int
        width of the window; must be a multiple of this DStream's
        batching interval
    slideDuration : int, optional
        sliding interval of the window (i.e., the interval after which
        the new DStream will generate RDDs); must be a multiple of this
        DStream's batching interval
    numPartitions : int, optional
        number of partitions of each RDD in the new DStream.
    filterFunc : function, optional
        function to filter expired key-value pairs;
        only pairs that satisfy the function are retained;
        set this to None if you do not want to filter
    """
    self._validate_window_param(windowDuration, slideDuration)
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    reduced = self.reduceByKey(func, numPartitions)

    if invFunc:

        def reduceFunc(t: datetime, a: Any, b: Any) -> Any:
            b = b.reduceByKey(func, numPartitions)
            r = a.union(b).reduceByKey(func, numPartitions) if a else b
            if filterFunc:
                r = r.filter(filterFunc)
            return r

        def invReduceFunc(t: datetime, a: Any, b: Any) -> Any:
            b = b.reduceByKey(func, numPartitions)
            joined = a.leftOuterJoin(b, numPartitions)
            return joined.mapValues(
                lambda kv: invFunc(kv[0], kv[1])  # type: ignore[misc]
                if kv[1] is not None
                else kv[0]
            )

        jreduceFunc = TransformFunction(self._sc, reduceFunc, reduced._jrdd_deserializer)
        jinvReduceFunc = TransformFunction(self._sc, invReduceFunc, reduced._jrdd_deserializer)
        if slideDuration is None:
            slideDuration = self._slideDuration
        assert self._sc._jvm is not None
        dstream = self._sc._jvm.PythonReducedWindowedDStream(
            reduced._jdstream.dstream(),
            jreduceFunc,
            jinvReduceFunc,
            self._ssc._jduration(windowDuration),
            self._ssc._jduration(slideDuration),  # type: ignore[arg-type]
        )
        return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)
    else:
        return reduced.window(windowDuration, slideDuration).reduceByKey(
            func, numPartitions  # type: ignore[arg-type]
        )