def test_addWindows(self):
    from ts.flint import windows
    from pyspark.sql import Row

    vol = self.vol()

    VolRow = Row('time', 'id', 'volume')
    id = [VolRow(int(r['time'].strftime('%s')), r['id'], r['volume'])
          for r in vol.collect()]

    expected_pdf = make_pdf([
        (1000, 7, 100, [id[0], id[1]]),
        (1000, 3, 200, [id[0], id[1]]),
        (1050, 3, 300, [id[0], id[1], id[2], id[3]]),
        (1050, 7, 400, [id[0], id[1], id[2], id[3]]),
        (1100, 3, 500, [id[2], id[3], id[4], id[5]]),
        (1100, 7, 600, [id[2], id[3], id[4], id[5]]),
        (1150, 3, 700, [id[4], id[5], id[6], id[7]]),
        (1150, 7, 800, [id[4], id[5], id[6], id[7]]),
        (1200, 3, 900, [id[6], id[7], id[8], id[9]]),
        (1200, 7, 1000, [id[6], id[7], id[8], id[9]]),
        (1250, 3, 1100, [id[8], id[9], id[10], id[11]]),
        (1250, 7, 1200, [id[8], id[9], id[10], id[11]]),
    ], ["time", "id", "volume", "window_past_50s"])

    new_pdf = vol.addWindows(windows.past_absolute_time("50s")).toPandas()
    assert_same(new_pdf, expected_pdf)
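# The tests in this section assume a shared `vol` fixture plus `make_pdf`
# and `assert_same` helpers from the surrounding test harness. A minimal
# sketch of the fixture, reconstructed from the expected outputs in these
# tests (hypothetical; the real helper lives elsewhere in the suite):
#
#     def vol(self):
#         return self.flintContext.read.pandas(make_pdf([
#             (1000, 7, 100), (1000, 3, 200),
#             (1050, 3, 300), (1050, 7, 400),
#             (1100, 3, 500), (1100, 7, 600),
#             (1150, 3, 700), (1150, 7, 800),
#             (1200, 3, 900), (1200, 7, 1000),
#             (1250, 3, 1100), (1250, 7, 1200),
#         ], ['time', 'id', 'volume']))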
def test_addWindows(self):
    from ts.flint import windows

    vol = self.vol()

    id = vol.collect()
    expected_pdf = test_utils.make_pdf([
        (1000, 7, 100, [id[0], id[1]]),
        (1000, 3, 200, [id[0], id[1]]),
        (1050, 3, 300, [id[0], id[1], id[2], id[3]]),
        (1050, 7, 400, [id[0], id[1], id[2], id[3]]),
        (1100, 3, 500, [id[2], id[3], id[4], id[5]]),
        (1100, 7, 600, [id[2], id[3], id[4], id[5]]),
        (1150, 3, 700, [id[4], id[5], id[6], id[7]]),
        (1150, 7, 800, [id[4], id[5], id[6], id[7]]),
        (1200, 3, 900, [id[6], id[7], id[8], id[9]]),
        (1200, 7, 1000, [id[6], id[7], id[8], id[9]]),
        (1250, 3, 1100, [id[8], id[9], id[10], id[11]]),
        (1250, 7, 1200, [id[8], id[9], id[10], id[11]]),
    ], ["time", "id", "volume", "window_past_50ns"])

    new_pdf = vol.addWindows(windows.past_absolute_time("50ns")).toPandas()
    test_utils.assert_same(new_pdf, expected_pdf)
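# A minimal sketch of the `make_pdf` helper used throughout (a hypothetical
# reconstruction: the `strftime('%s')` call in test_addWindows above suggests
# the integer `time` column is materialized as second-resolution timestamps):
#
#     import pandas as pd
#
#     def make_pdf(data, schema):
#         pdf = pd.DataFrame(data, columns=schema)
#         pdf['time'] = pd.to_datetime(pdf['time'], unit='s')
#         return pdf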
def test_summarizeWindows_numpy_udf(self):
    from ts.flint import windows
    from ts.flint.functions import udf
    from pyspark.sql.types import DoubleType, LongType

    vol = self.vol()

    df = self.flintContext.read.pandas(make_pdf([
        (1000, 3, 10.0),
        (1000, 7, 20.0),
        (1050, 3, 30.0),
        (1050, 7, 40.0),
        (1100, 3, 50.0),
        (1150, 3, 60.0),
        (1150, 7, 70.0),
        (1200, 3, 80.0),
        (1200, 7, 90.0),
        (1250, 7, 100.0),
    ], ['time', 'id', 'v']))

    @udf(DoubleType(), arg_type='numpy')
    def mean_np(v):
        assert isinstance(v, np.ndarray)
        return v.mean()

    @udf((DoubleType(), LongType()), arg_type='numpy')
    def mean_and_sum_np(v):
        assert isinstance(v, np.ndarray)
        return v.mean(), v.sum()

    @udf(DoubleType(), arg_type='numpy')
    def mean_np_df(window):
        assert isinstance(window, list)
        assert isinstance(window[-1], np.ndarray)
        return window[-1].mean()

    @udf(DoubleType(), arg_type='numpy')
    def mean_np_2(v, window):
        assert isinstance(v, np.float64)
        assert isinstance(window, list)
        assert isinstance(window[-1], np.ndarray)
        return v + window[-1].mean()

    @udf(DoubleType(), arg_type='numpy')
    def mean_np_df_2(left, window):
        assert isinstance(left, list)
        assert isinstance(left[0], np.float64)
        assert isinstance(window, list)
        assert isinstance(window[-1], np.ndarray)
        return window[-1].mean()

    w = windows.past_absolute_time('99s')

    result1 = vol.summarizeWindows(
        w, {'mean': mean_np(vol['volume'])}).toPandas()
    expected1 = make_pdf([
        (1000, 7, 100, 150.0),
        (1000, 3, 200, 150.0),
        (1050, 3, 300, 250.0),
        (1050, 7, 400, 250.0),
        (1100, 3, 500, 450.0),
        (1100, 7, 600, 450.0),
        (1150, 3, 700, 650.0),
        (1150, 7, 800, 650.0),
        (1200, 3, 900, 850.0),
        (1200, 7, 1000, 850.0),
        (1250, 3, 1100, 1050.0),
        (1250, 7, 1200, 1050.0),
    ], ['time', 'id', 'volume', 'mean'])
    assert_same(result1, expected1)

    result2 = vol.summarizeWindows(
        w, {'mean': mean_np(vol['volume'])}, key='id').toPandas()
    expected2 = make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 250.0),
        (1050, 7, 400, 250.0),
        (1100, 3, 500, 400.0),
        (1100, 7, 600, 500.0),
        (1150, 3, 700, 600.0),
        (1150, 7, 800, 700.0),
        (1200, 3, 900, 800.0),
        (1200, 7, 1000, 900.0),
        (1250, 3, 1100, 1000.0),
        (1250, 7, 1200, 1100.0),
    ], ['time', 'id', 'volume', 'mean'])
    assert_same(result2, expected2)

    result3 = vol.summarizeWindows(
        w, {'mean': mean_np_df(vol[['volume']])}).toPandas()
    expected3 = expected1
    assert_same(result3, expected3)

    result4 = vol.summarizeWindows(
        w, {'mean': mean_np_df(vol[['time', 'volume']])}).toPandas()
    expected4 = expected1
    assert_same(result4, expected4)

    result5 = df.summarizeWindows(
        w,
        {'mean': mean_np_2(df['v'], vol[['time', 'volume']])},
        other=vol,
        key='id').toPandas()
    expected5 = make_pdf([
        (1000, 3, 10.0, 210.0),
        (1000, 7, 20.0, 120.0),
        (1050, 3, 30.0, 280.0),
        (1050, 7, 40.0, 290.0),
        (1100, 3, 50.0, 450.0),
        (1150, 3, 60.0, 660.0),
        (1150, 7, 70.0, 770.0),
        (1200, 3, 80.0, 880.0),
        (1200, 7, 90.0, 990.0),
        (1250, 7, 100.0, 1200.0),
    ], ['time', 'id', 'v', 'mean'])
    assert_same(result5, expected5)

    result6 = df.summarizeWindows(
        w,
        {'mean': mean_np_df_2(df[['v']], vol[['time', 'volume']])},
        other=vol,
        key='id').toPandas()
    # mean_np_df_2 ignores its `left` argument, so the expected values are
    # the plain per-key window means of `volume` on df's timeline.
    expected6 = make_pdf([
        (1000, 3, 10.0, 200.0),
        (1000, 7, 20.0, 100.0),
        (1050, 3, 30.0, 250.0),
        (1050, 7, 40.0, 250.0),
        (1100, 3, 50.0, 400.0),
        (1150, 3, 60.0, 600.0),
        (1150, 7, 70.0, 700.0),
        (1200, 3, 80.0, 800.0),
        (1200, 7, 90.0, 900.0),
        (1250, 7, 100.0, 1100.0),
    ], ['time', 'id', 'v', 'mean'])
    assert_same(result6, expected6)

    result7 = df.summarizeWindows(
        w,
        {'mean': mean_np_df(vol[['time', 'volume']])},
        other=vol,
        key='id').toPandas()
    expected7 = expected6
    assert_same(result7, expected7)

    result8 = vol.summarizeWindows(
        w,
        {('mean', 'sum'): mean_and_sum_np(vol['volume'])},
        key='id').toPandas()
    expected8 = make_pdf([
        (1000, 7, 100, 100.0, 100),
        (1000, 3, 200, 200.0, 200),
        (1050, 3, 300, 250.0, 500),
        (1050, 7, 400, 250.0, 500),
        (1100, 3, 500, 400.0, 800),
        (1100, 7, 600, 500.0, 1000),
        (1150, 3, 700, 600.0, 1200),
        (1150, 7, 800, 700.0, 1400),
        (1200, 3, 900, 800.0, 1600),
        (1200, 7, 1000, 900.0, 1800),
        (1250, 3, 1100, 1000.0, 2000),
        (1250, 7, 1200, 1100.0, 2200),
    ], ['time', 'id', 'volume', 'mean', 'sum'])
    assert_same(result8, expected8)
def test_summarizeWindows_udf(self):
    from ts.flint import udf
    from ts.flint import windows
    from collections import OrderedDict
    from pyspark.sql.types import DoubleType, LongType

    vol = self.vol()
    w = windows.past_absolute_time('99s')

    df = self.flintContext.read.pandas(make_pdf([
        (1000, 3, 10.0),
        (1000, 7, 20.0),
        (1050, 3, 30.0),
        (1050, 7, 40.0),
        (1100, 3, 50.0),
        (1150, 3, 60.0),
        (1150, 7, 70.0),
        (1200, 3, 80.0),
        (1200, 7, 90.0),
        (1250, 7, 100.0),
    ], ['time', 'id', 'v']))

    result1 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', udf(lambda time, window: window.mean(),
                         DoubleType())(df['time'], vol['volume'])),
        ]),
        key="id",
        other=vol).toPandas()
    expected1 = make_pdf([
        (1000, 3, 10.0, 200.0),
        (1000, 7, 20.0, 100.0),
        (1050, 3, 30.0, 250.0),
        (1050, 7, 40.0, 250.0),
        (1100, 3, 50.0, 400.0),
        (1150, 3, 60.0, 600.0),
        (1150, 7, 70.0, 700.0),
        (1200, 3, 80.0, 800.0),
        (1200, 7, 90.0, 900.0),
        (1250, 7, 100.0, 1100.0),
    ], ['time', 'id', 'v', 'mean'])
    assert_same(result1, expected1)

    result2 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', udf(lambda window: window.mean(),
                         DoubleType())(vol['volume'])),
        ]),
        key='id',
        other=vol).toPandas()
    expected2 = expected1
    assert_same(result2, expected2)

    result3 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', udf(lambda window: window.mean(),
                         DoubleType())(vol['volume'])),
            ('count', udf(lambda time, window: len(window),
                          LongType())(df['time'], vol['volume'])),
        ]),
        key='id',
        other=vol).toPandas()
    expected3 = make_pdf([
        (1000, 3, 10.0, 200.0, 1),
        (1000, 7, 20.0, 100.0, 1),
        (1050, 3, 30.0, 250.0, 2),
        (1050, 7, 40.0, 250.0, 2),
        (1100, 3, 50.0, 400.0, 2),
        (1150, 3, 60.0, 600.0, 2),
        (1150, 7, 70.0, 700.0, 2),
        (1200, 3, 80.0, 800.0, 2),
        (1200, 7, 90.0, 900.0, 2),
        (1250, 7, 100.0, 1100.0, 2),
    ], ['time', 'id', 'v', 'mean', 'count'])
    assert_same(result3, expected3)

    @udf('double')
    def window_udf(time, window):
        return (time - window.time).mean().seconds + window.volume.mean()

    result4 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', window_udf(df['time'], vol[['time', 'volume']])),
        ]),
        key='id',
        other=vol).toPandas()
    expected4 = make_pdf([
        (1000, 3, 10.0, 200.0),
        (1000, 7, 20.0, 100.0),
        (1050, 3, 30.0, 275.0),
        (1050, 7, 40.0, 275.0),
        (1100, 3, 50.0, 425.0),
        (1150, 3, 60.0, 625.0),
        (1150, 7, 70.0, 725.0),
        (1200, 3, 80.0, 825.0),
        (1200, 7, 90.0, 925.0),
        (1250, 7, 100.0, 1125.0),
    ], ['time', 'id', 'v', 'mean'])
    assert_same(result4, expected4)

    @udf(DoubleType())
    def foo5(row, window):
        return (row[0] - window.time).mean().seconds + window.volume.mean()

    result5 = df.summarizeWindows(
        w,
        OrderedDict([
            ('mean', foo5(df[['time', 'v']], vol[['time', 'volume']])),
        ]),
        key='id',
        other=vol).toPandas()
    expected5 = expected4
    assert_same(result5, expected5)

    @udf((DoubleType(), LongType()))
    def mean_and_count(v):
        return v.mean(), len(v)

    result6 = df.summarizeWindows(
        w,
        OrderedDict([
            (('mean', 'count'), mean_and_count(vol['volume'])),
        ]),
        key='id',
        other=vol).toPandas()
    expected6 = expected3
    assert_same(result6, expected6)

    @udf(DoubleType())
    def mean(v):
        return v.mean()

    result7 = vol.summarizeWindows(
        w, {'mean': mean(vol['volume'])}, key='id').toPandas()
    expected7 = make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 250.0),
        (1050, 7, 400, 250.0),
        (1100, 3, 500, 400.0),
        (1100, 7, 600, 500.0),
        (1150, 3, 700, 600.0),
        (1150, 7, 800, 700.0),
        (1200, 3, 900, 800.0),
        (1200, 7, 1000, 900.0),
        (1250, 3, 1100, 1000.0),
        (1250, 7, 1200, 1100.0),
    ], ['time', 'id', 'volume', 'mean'])
    assert_same(result7, expected7)

    result8 = vol.summarizeWindows(
        w, {'mean': mean(vol['volume'])}).toPandas()
    expected8 = make_pdf([
        (1000, 7, 100, 150.0),
        (1000, 3, 200, 150.0),
        (1050, 3, 300, 250.0),
        (1050, 7, 400, 250.0),
        (1100, 3, 500, 450.0),
        (1100, 7, 600, 450.0),
        (1150, 3, 700, 650.0),
        (1150, 7, 800, 650.0),
        (1200, 3, 900, 850.0),
        (1200, 7, 1000, 850.0),
        (1250, 3, 1100, 1050.0),
        (1250, 7, 1200, 1050.0),
    ], ['time', 'id', 'volume', 'mean'])
    assert_same(result8, expected8)
def test_summarizeWindows(self):
    from ts.flint import windows
    from ts.flint import summarizers

    vol = self.vol()
    w = windows.past_absolute_time('99s')

    new_pdf1 = vol.summarizeWindows(w, summarizers.sum("volume")).toPandas()
    expected_pdf1 = make_pdf([
        (1000, 7, 100, 300.0),
        (1000, 3, 200, 300.0),
        (1050, 3, 300, 1000.0),
        (1050, 7, 400, 1000.0),
        (1100, 3, 500, 1800.0),
        (1100, 7, 600, 1800.0),
        (1150, 3, 700, 2600.0),
        (1150, 7, 800, 2600.0),
        (1200, 3, 900, 3400.0),
        (1200, 7, 1000, 3400.0),
        (1250, 3, 1100, 4200.0),
        (1250, 7, 1200, 4200.0),
    ], ["time", "id", "volume", "volume_sum"])
    assert_same(new_pdf1, expected_pdf1)

    new_pdf2 = vol.summarizeWindows(
        w, summarizers.sum("volume"), key="id").toPandas()
    expected_pdf2 = make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 500.0),
        (1050, 7, 400, 500.0),
        (1100, 3, 500, 800.0),
        (1100, 7, 600, 1000.0),
        (1150, 3, 700, 1200.0),
        (1150, 7, 800, 1400.0),
        (1200, 3, 900, 1600.0),
        (1200, 7, 1000, 1800.0),
        (1250, 3, 1100, 2000.0),
        (1250, 7, 1200, 2200.0),
    ], ["time", "id", "volume", "volume_sum"])
    assert_same(new_pdf2, expected_pdf2)

    interval_with_id = self.flintContext.read.pandas(make_pdf([
        (1000, 3),
        (1000, 7),
        (1050, 3),
        (1050, 7),
        (1100, 3),
        (1150, 3),
        (1150, 7),
        (1200, 3),
        (1200, 7),
        (1250, 7),
    ], ["time", "id"]))

    new_pdf3 = interval_with_id.summarizeWindows(
        w, summarizers.sum("volume"), key="id", other=vol).toPandas()
    expected_pdf3 = make_pdf([
        (1000, 3, 200.0),
        (1000, 7, 100.0),
        (1050, 3, 500.0),
        (1050, 7, 500.0),
        (1100, 3, 800.0),
        (1150, 3, 1200.0),
        (1150, 7, 1400.0),
        (1200, 3, 1600.0),
        (1200, 7, 1800.0),
        (1250, 7, 2200.0),
    ], ["time", "id", "volume_sum"])
    assert_same(new_pdf3, expected_pdf3)
df_control_previous_day_val = df_control.shiftTime(
    windows.future_absolute_time('1day')).toDF('time', 'previous_day_val')
df_control_previous_wk_val = df_control.shiftTime(
    windows.future_absolute_time('7day')).toDF('time', 'previous_wk_val')
df_control_joined = (df_control
                     .leftJoin(df_control_previous_day_val)
                     .leftJoin(df_control_previous_wk_val))
df_control_joined.show()

# COMMAND ----------

from ts.flint import summarizers

df_control_decayed_return = (df_control_joined
                             .where("time > '2018-06-15'")
                             .summarizeWindows(
                                 window=windows.past_absolute_time('42day'),
                                 summarizer=summarizers.ewma('previous_wk_val',
                                                             alpha=0.5)))
display(df_control_decayed_return)

# COMMAND ----------

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["previous_wk_val", "previous_wk_val_ewma"],
    outputCol="features")
output = assembler.transform(df_control_decayed_return).select(
    'ACTL_VAL', 'features').toDF('label', 'features')
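# COMMAND ----------

# A minimal sketch of the fit step (an assumed continuation; the original
# cell stops after assembling features, with LinearRegression already
# imported above). The hyperparameters here are illustrative:
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = lr.fit(output)
print("coefficients: %s  intercept: %s" % (model.coefficients, model.intercept))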
def test_summarizeWindows(flintContext, tests_utils, windows, summarizers, vol):
    new_pdf1 = vol.summarizeWindows(windows.past_absolute_time('99ns'),
                                    summarizers.sum("volume")).toPandas()
    expected_pdf1 = make_pdf([
        (1000, 7, 100, 300.0),
        (1000, 3, 200, 300.0),
        (1050, 3, 300, 1000.0),
        (1050, 7, 400, 1000.0),
        (1100, 3, 500, 1800.0),
        (1100, 7, 600, 1800.0),
        (1150, 3, 700, 2600.0),
        (1150, 7, 800, 2600.0),
        (1200, 3, 900, 3400.0),
        (1200, 7, 1000, 3400.0),
        (1250, 3, 1100, 4200.0),
        (1250, 7, 1200, 4200.0),
    ], ["time", "id", "volume", "volume_sum"])
    tests_utils.assert_same(new_pdf1, expected_pdf1)

    new_pdf2 = vol.summarizeWindows(windows.past_absolute_time('99ns'),
                                    summarizers.sum("volume"),
                                    key="id").toPandas()
    expected_pdf2 = make_pdf([
        (1000, 7, 100, 100.0),
        (1000, 3, 200, 200.0),
        (1050, 3, 300, 500.0),
        (1050, 7, 400, 500.0),
        (1100, 3, 500, 800.0),
        (1100, 7, 600, 1000.0),
        (1150, 3, 700, 1200.0),
        (1150, 7, 800, 1400.0),
        (1200, 3, 900, 1600.0),
        (1200, 7, 1000, 1800.0),
        (1250, 3, 1100, 2000.0),
        (1250, 7, 1200, 2200.0),
    ], ["time", "id", "volume", "volume_sum"])
    tests_utils.assert_same(new_pdf2, expected_pdf2)

    interval_with_id = flintContext.read.pandas(make_pdf([
        (1000, 3),
        (1000, 7),
        (1050, 3),
        (1050, 7),
        (1100, 3),
        (1150, 3),
        (1150, 7),
        (1200, 3),
        (1200, 7),
        (1250, 7),
    ], ["time", "id"]))

    new_pdf3 = interval_with_id.summarizeWindows(
        windows.past_absolute_time('99ns'),
        summarizers.sum("volume"),
        key="id",
        other=vol).toPandas()
    expected_pdf3 = make_pdf([
        (1000, 3, 200.0),
        (1000, 7, 100.0),
        (1050, 3, 500.0),
        (1050, 7, 500.0),
        (1100, 3, 800.0),
        (1150, 3, 1200.0),
        (1150, 7, 1400.0),
        (1200, 3, 1600.0),
        (1200, 7, 1800.0),
        (1250, 7, 2200.0),
    ], ["time", "id", "volume_sum"])
    tests_utils.assert_same(new_pdf3, expected_pdf3)
sp500_previous_day_return = sp500_return.shiftTime(
    windows.future_absolute_time('1day')).toDF('time', 'previous_day_return')
sp500_joined_return = sp500_return.leftJoin(sp500_previous_day_return)
sp500_joined_return.show()

# COMMAND ----------

sp500_joined_return = sp500_return.leftJoin(
    sp500_previous_day_return, tolerance='3days').dropna()
sp500_joined_return.show()

# COMMAND ----------

from ts.flint import summarizers

sp500_decayed_return = sp500_joined_return.summarizeWindows(
    window=windows.past_absolute_time('7day'),
    summarizer=summarizers.ewma('previous_day_return', alpha=0.5))
sp500_decayed_return.show()

# COMMAND ----------

from ts.flint import udf
import numpy as np

@udf('double', arg_type='numpy')
def decayed(columns):
    # Exponentially decay the windowed values, oldest first, and sum them.
    v = columns[0]
    decay = np.power(0.5, np.arange(len(v)))[::-1]
    return (v * decay).sum()
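# COMMAND ----------

# A sketch of applying the numpy udf defined above through summarizeWindows
# (assumed usage, following the dict-of-udf-columns API exercised in the
# tests; the output column name `previous_day_decayed_return` is illustrative):
sp500_decayed_return_udf = sp500_joined_return.summarizeWindows(
    window=windows.past_absolute_time('7day'),
    summarizer={'previous_day_decayed_return':
                decayed(sp500_joined_return[['previous_day_return']])})
sp500_decayed_return_udf.show()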