def popular_taxi_vendor():
    """Print per-vendor ride counts over sliding pickup-time windows.

    Executes the statement produced by ``create_table_ddl`` (defined outside
    this block) with a watermark on ``pickupTime`` delayed by 30 seconds,
    reads the ``TaxiRide`` table, counts ``vendorId`` per vendor over
    15-minute windows that advance every 5 minutes, converts the result to an
    append stream, prints it, and runs the job as ``Popular-Taxi-Vendor``.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # presumably this DDL registers the 'TaxiRide' table read below -- confirm
    # against create_table_ddl.
    ddl = create_table_ddl(
        "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS")
    t_env.execute_sql(ddl)

    rides = t_env.from_path('TaxiRide')
    vendor_and_pickup = rides.select(rides.vendorId, rides.pickupTime)

    sliding_window = (
        Slide.over('15.minutes')
        .every('5.minutes')
        .on(rides.pickupTime)
        .alias('w')
    )
    per_vendor_window = vendor_and_pickup \
        .window(sliding_window) \
        .group_by(rides.vendorId, col('w'))
    popular_rides = per_vendor_window.select(
        rides.vendorId,
        col('w').start.alias('start'),
        col('w').end.alias('end'),
        rides.vendorId.count.alias('cnt'),
    )

    field_names = ['vendorId', 'start', 'end', 'cnt']
    field_types = [
        Types.INT(),
        Types.SQL_TIMESTAMP(),
        Types.SQL_TIMESTAMP(),
        Types.LONG(),
    ]
    output_stream = t_env.to_append_stream(
        popular_rides, Types.ROW_NAMED(field_names, field_types))
    output_stream.print()

    env.execute('Popular-Taxi-Vendor')
def max_travellers_per_destination():
    """Print an hourly per-destination-zone aggregate of taxi rides.

    Executes the statement produced by ``create_table_ddl`` (defined outside
    this block) with a watermark on ``dropOffTime`` delayed by 30 seconds,
    reads the ``TaxiRide`` table, groups by ``destLocationZone`` over 1-hour
    tumbling windows on ``dropOffTime``, converts the result to an append
    stream, prints it, and runs the job as ``Max-Travellers-Per-Destination``.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # presumably this DDL registers the 'TaxiRide' table read below -- confirm
    # against create_table_ddl.
    ddl = create_table_ddl(
        "WATERMARK FOR dropOffTime AS dropOffTime - INTERVAL '30' SECONDS")
    t_env.execute_sql(ddl)

    rides = t_env.from_path('TaxiRide')
    projected = rides.select(
        rides.passengerCount, rides.dropOffTime, rides.destLocationZone)

    hourly_window = Tumble().over('1.hour').on(rides.dropOffTime).alias('w')
    per_zone_window = projected \
        .window(hourly_window) \
        .group_by(rides.destLocationZone, col('w'))

    # NOTE(review): `.count` is a COUNT aggregate over passengerCount, not a
    # sum of passengers -- confirm this matches the intent of the function
    # name before relying on 'cnt' as a traveller total.
    no_of_travelers_per_dest = per_zone_window.select(
        rides.destLocationZone,
        col('w').start.alias('start'),
        col('w').end.alias('end'),
        rides.passengerCount.count.alias('cnt'),
    )

    field_names = ['destLocationZone', 'start', 'end', 'cnt']
    field_types = [
        Types.STRING(),
        Types.SQL_TIMESTAMP(),
        Types.SQL_TIMESTAMP(),
        Types.LONG(),
    ]
    output_stream = t_env.to_append_stream(
        no_of_travelers_per_dest, Types.ROW_NAMED(field_names, field_types))
    output_stream.print()

    env.execute('Max-Travellers-Per-Destination')
def setUp(self):
    """Create a parallelism-2 stream environment and a blink streaming table environment."""
    # Two-argument super(): lookup starts after PyFlinkBlinkStreamTableTestCase
    # in the MRO, so the explicit form is kept as written.
    super(PyFlinkBlinkStreamTableTestCase, self).setUp()

    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)

    settings_builder = EnvironmentSettings.new_instance()
    blink_streaming_settings = (
        settings_builder.in_streaming_mode().use_blink_planner().build())
    self.t_env = StreamTableEnvironment.create(
        self.env, environment_settings=blink_streaming_settings)
def setUp(self):
    """Create the blink streaming table environment and set the task off-heap size.

    Builds a stream environment with parallelism 2, creates a table
    environment in streaming mode on the blink planner, and sets
    ``taskmanager.memory.task.off-heap.size`` to ``80mb`` in its configuration.
    """
    # Two-argument super(): lookup starts after PyFlinkBlinkStreamTableTestCase
    # in the MRO, so the explicit form is kept as written.
    super(PyFlinkBlinkStreamTableTestCase, self).setUp()

    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)

    blink_streaming_settings = (
        EnvironmentSettings.new_instance()
        .in_streaming_mode()
        .use_blink_planner()
        .build()
    )
    self.t_env = StreamTableEnvironment.create(
        self.env, environment_settings=blink_streaming_settings)

    configuration = self.t_env.get_config().get_configuration()
    configuration.set_string("taskmanager.memory.task.off-heap.size", "80mb")
def setUp(self):
    """Create the blink streaming table environment and set the Python bundle size.

    Builds a stream environment with parallelism 2, creates a table
    environment in streaming mode on the blink planner, and sets
    ``python.fn-execution.bundle.size`` to ``1`` in its configuration.
    """
    # Two-argument super(): lookup starts after PyFlinkBlinkStreamTableTestCase
    # in the MRO, so the explicit form is kept as written.
    super(PyFlinkBlinkStreamTableTestCase, self).setUp()

    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)

    blink_streaming_settings = (
        EnvironmentSettings.new_instance()
        .in_streaming_mode()
        .use_blink_planner()
        .build()
    )
    self.t_env = StreamTableEnvironment.create(
        self.env, environment_settings=blink_streaming_settings)

    configuration = self.t_env.get_config().get_configuration()
    configuration.set_string("python.fn-execution.bundle.size", "1")
def get_stream_table_environment(self) -> StreamTableEnvironment:
    """
    Get the StreamTableEnvironment.

    If no StreamTableEnvironment has been set yet, one is created from
    ``StreamExecutionEnvironment.get_execution_environment()`` with no further
    configuration, stored on this object, and reused by later calls.

    :return: the StreamTableEnvironment.

    .. versionadded:: 1.11.0
    """
    # Fast path: an environment was already set or created earlier.
    if self._stream_tab_env is not None:
        return self._stream_tab_env

    stream_env = StreamExecutionEnvironment.get_execution_environment()
    self._stream_tab_env = StreamTableEnvironment.create(stream_env)
    return self._stream_tab_env
def test_create_table_environment(self):
    """Check that a TableConfig given to StreamTableEnvironment.create is visible via get_config()."""
    expected_code_length = 32000
    expected_timezone = "Asia/Shanghai"

    table_config = TableConfig()
    table_config.set_max_generated_code_length(expected_code_length)
    table_config.set_null_check(False)
    table_config.set_timezone(expected_timezone)

    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env, table_config)

    # Read the configuration back from the created environment and compare it
    # with the values set above.
    actual_config = table_env.get_config()
    self.assertFalse(actual_config.get_null_check())
    self.assertEqual(
        actual_config.get_max_generated_code_length(), expected_code_length)
    self.assertEqual(actual_config.get_timezone(), expected_timezone)
def popular_destination_query():
    """Print destinations whose windowed ride count exceeds ``args.threshold``.

    Executes the statement produced by ``create_table_ddl`` (defined outside
    this block) with a watermark on ``pickupTime`` delayed by 30 seconds, then
    runs a SQL query over ``TaxiRide`` that counts rides per ``destLocationId``
    in HOP windows declared with ``INTERVAL '5' MINUTE`` and
    ``INTERVAL '15' MINUTE`` and keeps rows with ``cnt`` above the threshold.
    The result is converted to an append stream, printed, and the job is run
    as ``Popular-Destination``.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # presumably this DDL registers the 'TaxiRide' table queried below --
    # confirm against create_table_ddl.
    ddl = create_table_ddl(
        "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS")
    t_env.execute_sql(ddl)

    # NOTE(review): `args` is not defined in this block and its threshold is
    # interpolated straight into the SQL text -- presumably a numeric
    # command-line option; confirm it is validated where `args` is built.
    query = f"""SELECT destLocationId, wstart, wend, cnt FROM (SELECT destLocationId, HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart, HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend, COUNT(destLocationId) AS cnt FROM (SELECT pickupTime, destLocationId FROM TaxiRide) GROUP BY destLocationId, HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) ) WHERE cnt > {args.threshold} """
    results = t_env.sql_query(query)

    field_names = ['destLocationId', 'wstart', 'wend', 'cnt']
    field_types = [
        Types.INT(),
        Types.SQL_TIMESTAMP(),
        Types.SQL_TIMESTAMP(),
        Types.LONG(),
    ]
    output_stream = t_env.to_append_stream(
        results, Types.ROW_NAMED(field_names, field_types))
    output_stream.print()

    env.execute('Popular-Destination')