def test_sw_sax_limit_constant(self):
    """
    Test sliding window and SAX on a constant timeseries with two greater values.
    """
    sax_info = ConfigSax(paa=10,
                         sequences_size=10,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0.5,
                         coefficients=[0.1, 0.9],
                         alphabet_size=5)

    spark_ctx = ScManager.get()
    result, _ = sliding_windows(ts_list=["specific_time_serie"],
                                sax_info=sax_info,
                                spark_ctx=spark_ctx)
    print("result={}".format(result.collect()))

    sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                      paa=sax_info.paa,
                                      alphabet_size=sax_info.alphabet_size)
    print("sax_word={}".format(sax_result.sax_word))

    # PAA_value = 0 => 'c'
    # PAA_value = 10 => 'e' or 'd'
    # PAA_value = -10 => 'a' or 'b'
    # BUG FIX: the original used `is` to compare strings, which tests
    # object identity, not equality — it only ever passed by accident of
    # CPython string interning. Use value comparison instead.
    self.assertIn(sax_result.sax_word, ('ccccccccae', 'ccccccccbd'))
def test_sliding_window_sax_basic(self):
    """
    Nominal case: sliding windows followed by SAX on a linear timeseries.
    """
    config = ConfigSax(paa=3,
                       sequences_size=6,
                       with_mean=True,
                       with_std=True,
                       global_norm=False,
                       local_norm=False,
                       linear_filter=False,
                       recovery=0.5,
                       coefficients=[0.1, 0.9],
                       alphabet_size=3)

    sequences, _ = sliding_windows(ts_list=["linear_time_serie"],
                                   sax_info=config,
                                   spark_ctx=ScManager.get())

    sax_result = run_sax_on_sequences(rdd_sequences_data=sequences,
                                      paa=config.paa,
                                      alphabet_size=config.alphabet_size)

    # recovery = 0.5 and word_size = 3 => sax_result = 'aab abc bcc'
    self.assertEqual(sax_result.sax_word, 'aababcbcc')
def test_sliding_window_filter(self):
    """
    Testing the linear filter applied by the sliding window.
    """
    sax_info = ConfigSax(paa=3,
                         sequences_size=6,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=True,
                         recovery=0.5,
                         coefficients=[1, 0.5],
                         alphabet_size=6)

    spark_ctx = ScManager.get()

    # Case 1: a linear timeseries — every sequence is linear, so the
    # filter is expected to drop them all.
    sequences, _ = sliding_windows(ts_list=["linear_time_serie"],
                                   sax_info=sax_info,
                                   spark_ctx=spark_ctx)
    collected = sequences.collect()
    # all sequences are linear => no sequence
    self.assertEqual(len(collected), 0)

    # Case 2: constant-pattern timeseries, recovery = 0
    # (=> no overlap between consecutive sequences)
    sax_info.coefficients = [0, 1]
    sax_info.recovery = 0
    sequences, _ = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                   sax_info=sax_info,
                                   spark_ctx=spark_ctx)
    collected = sequences.collect()
    LOGGER.info("result=%s", collected)
    LOGGER.info("ts_init=%s", get_ts_mock("ts_with_constant_pattern"))

    # Sequence of 12 pts, recovery = 0 (no recovery) -> 2 sequences
    self.assertEqual(len(collected), 2)
def test_patterns(self):
    """
    Test the 'main' function of random projection.

    Setting all the parameters is quite long...
    """
    sax_info = ConfigSax(paa=4,
                         sequences_size=4,
                         with_mean=True,
                         with_std=True,
                         global_norm=True,
                         local_norm=True,
                         linear_filter=True,
                         recovery=1,
                         coefficients=[0.1, 0.9],
                         alphabet_size=4)

    ts_list = [{'tsuid': 'testPatternA'},
               {'tsuid': 'tesPatternB'},
               {'tsuid': 'testPatternC'},
               {'tsuid': 'testPatternConstant'},
               {'tsuid': 'testPatternLinear'},
               {'tsuid': 'testPatternTooSmall'},
               {'tsuid': 'testPatternTrivialMatch'},
               {'tsuid': 'testPatternRealistic'}]

    # get all the tsuid (necessary for the format of the result)
    tsuid_list = [ts_ref['tsuid'] for ts_ref in ts_list]

    COLLISION_INFO.nb_iterations = 10

    # set the recognition_info.min_value as done in 'main_random_projection'
    max_iterations = binom(sax_info.paa, COLLISION_INFO.index)
    RECOGNITION_INFO.min_value = int(0.05 * max_iterations)
    RECOGNITION_INFO.activate_spark = None

    # We have set the values
    result = random_projections(ts_list=tsuid_list,
                                sax_info=sax_info,
                                collision_info=COLLISION_INFO,
                                recognition_info=RECOGNITION_INFO)

    LOGGER.info(len(result["patterns"]))
    self.assertTrue(len(result["patterns"]) > 0)
def test_sax(self):
    """
    SAX tests.

    First case: the PAA step is a no-op (4 PAA values for 4 points per
    sequence) and the PAA values are equidistant.
    Second case: the PAA values are actually computed (12 points reduced
    to 4 PAA values per sequence).
    """
    config = ConfigSax(paa=4,
                       sequences_size=4,
                       with_mean=True,
                       with_std=True,
                       global_norm=False,
                       local_norm=False,
                       linear_filter=False,
                       recovery=0.5,
                       coefficients=[0.1, 0.9],
                       alphabet_size=4)

    spark_ctx = ScManager.get()
    sequences, _ = sliding_windows(ts_list=["simple_sequences_ts0", "simple_sequences_ts1"],
                                   sax_info=config,
                                   spark_ctx=spark_ctx)
    LOGGER.info("sliding_windows done!")

    sax_result = run_sax_on_sequences(rdd_sequences_data=sequences,
                                      paa=config.paa,
                                      alphabet_size=config.alphabet_size)
    collected = sequences.collect()
    LOGGER.info("sax_result=%s", sax_result)
    LOGGER.info("result=%s", collected)

    # the PAA : [[4, 4, 0, 2], [-2, 2, -2, 0]]
    self.assertEqual(sax_result.paa.collect(), [4, 4, 0, 2, -2, 2, -2, 0])
    # the result expected : 'ddbc acab'
    self.assertEqual(sax_result.sax_word, 'ddbcacab')

    # Second case: the PAA values must be computed
    config = ConfigSax(paa=4,
                       sequences_size=12,
                       with_mean=True,
                       with_std=True,
                       global_norm=False,
                       local_norm=False,
                       linear_filter=False,
                       recovery=0.5,
                       coefficients=[0.1, 0.9],
                       alphabet_size=4)

    sequences, _ = sliding_windows(ts_list=["sequences_1_ts0", "sequences_1_ts1"],
                                   sax_info=config,
                                   spark_ctx=spark_ctx)
    sax_result = run_sax_on_sequences(rdd_sequences_data=sequences,
                                      paa=config.paa,
                                      alphabet_size=config.alphabet_size)

    # the PAA : [[1, 4, -2, 1], [4, -2, -3, -3]]
    self.assertEqual(sax_result.paa.collect(), [1, 4, -2, 1, 4, -2, -3, -3])
    # the result expected : 'cdbc dbaa'
    self.assertEqual(sax_result.sax_word, 'cdbcdbaa')
def test_sliding_window_norm(self):
    """
    Testing global and local normalization of the sliding window.
    """
    epsilon = 1.0e-10

    # recovery = 0 (no recovery) -> 3 seq of 4 points (nb_points = 12)
    sax_info = ConfigSax(paa=3,
                         sequences_size=4,
                         with_mean=True,
                         with_std=True,
                         global_norm=True,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0,
                         coefficients=[0.1, 1],
                         alphabet_size=6)

    spark_ctx = ScManager.get()

    # Global normalization: the whole timeseries is normalized
    result, coeff = sliding_windows(ts_list=["linear_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
    collected = result.collect()
    coeff = coeff.collect()

    # coeff holds the mean and variance of each sequence:
    # 12 points, no recovery (recovery=0) -> 3 seq of 4 points
    self.assertEqual(len(coeff), 3)

    # Gather every sequence value into a single flat array
    # collected[i] = (key, list([timestamps, values],[,],...))
    ts_value = np.concatenate([seq_data[:, 1] for _, seq_data in collected])
    LOGGER.info("result=%s", collected)

    # 3 sequences of 4 points each => 12 values = nb_points
    self.assertEqual(len(ts_value), 12)
    LOGGER.info("ts_std=%s", ts_value.std())
    LOGGER.info("ts_mean=%s", np.mean(ts_value))

    # global normalisation => values have std deviation 1 and mean 0
    self.assertTrue(1 - epsilon < np.std(ts_value) < 1 + epsilon)
    self.assertTrue(-epsilon < np.mean(ts_value) < epsilon)

    # Local normalization: every sequence is normalized independently
    sax_info.global_norm = False
    sax_info.local_norm = True
    sax_info.linear_filter = True
    # Recovery = 1 : maximum recovery
    sax_info.recovery = 1

    result, coeff = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
    collected = result.collect()

    # Each sequence must have std deviation 1 and mean 0
    for _, seq_data in collected:
        seq_value = seq_data[:, 1]
        self.assertTrue(1 - epsilon < np.std(seq_value) < 1 + epsilon)
        self.assertTrue(-epsilon < np.mean(seq_value) < epsilon)
def test_sliding_window_recovery(self):
    """
    Testing the recovery parameter of the sliding window.
    """
    sax_info = ConfigSax(paa=3,
                         sequences_size=6,
                         with_mean=True,
                         with_std=True,
                         global_norm=False,
                         local_norm=False,
                         linear_filter=False,
                         recovery=0.5,
                         coefficients=[1, 1],
                         alphabet_size=6)
    ts_name = ["linear_time_serie"]
    spark_ctx = ScManager.get()

    # Test with recovery = 0.5
    result, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx)
    result = result.collect()
    # 2 sequences in the timeseries => 3 sequences at the end
    self.assertEqual(len(result), 3)

    # Test with MAX recovery
    # recovery = 1 (the maximum : 100 % <=> the next window start one point to the right)
    sax_info.recovery = 1.0
    result, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx)
    result = result.collect()

    # remember that in 'sliding_window' function, we call 'get_ts_mock(ts_name)[0]'
    ts = get_ts_mock(ts_name)[0]
    known_values = list(ts[0:6][:, 1]) + list(ts[6:12][:, 1])
    known_timestamps = list(ts[0:6][:, 0]) + list(ts[6:12][:, 0])

    # Check the timestamps and the values of the sequences
    # result[i] = (key, list([timestamps, values],[,],...))
    # BUG FIX: the original wrapped a generator in np.all(...), which is
    # always truthy (numpy stores the generator object in a 0-d array),
    # so the assertions never actually checked anything.  Use the
    # builtin all() and test each element's membership explicitly.
    self.assertTrue(all(value in known_values
                        for _, seq in result
                        for value in seq[:, 1]))
    self.assertTrue(all(timestamp in known_timestamps
                        for _, seq in result
                        for timestamp in seq[:, 0]))

    # Test with (nearly) MINIMUM recovery
    # recovery = 0.01 (almost no overlap between consecutive sequences)
    sax_info.recovery = 0.01
    result2, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx)
    result2 = result2.collect()
    # 2 sequences in the timeseries => 2 sequences
    self.assertEqual(len(result2), 2)
# Add logs to the unittest stdout
# BUG FIX: LOGGER appeared twice in the original list, so it was given
# two stream handlers and emitted every record twice.
for the_logger in [SAX_LOGGER, RECOG_LOGGER, COLL_LOGGER, LOGGER]:
    the_logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(funcName)s:%(message)s')

    # Create another handler that will redirect log entries to STDOUT
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)
    the_logger.addHandler(stream_handler)

SAX_INFO = ConfigSax(paa=20,
                     sequences_size=1000,
                     with_mean=True,
                     with_std=True,
                     global_norm=False,
                     local_norm=True,
                     linear_filter=True,
                     recovery=0.8,
                     coefficients=[0.1, 0.9],
                     alphabet_size=10)

COLLISION_INFO = ConfigCollision(iterations=1, index=2, config_sax=SAX_INFO)

# Avoiding spark jobs here: already tested in test_recognition
RECOGNITION_INFO = ConfigRecognition(is_stopped_by_eq9=True,
                                     is_algo_method_global=True,
                                     min_value=1,
                                     iterations=10,
                                     radius=1.5,
                                     neighborhood_method=2,
                                     activate_spark=False)