def setUpClass(cls): """ Needs environment variables to be explicitly set: SPARK_HOME and PYSPARK_PYTHON """ assert os.getenv("PYSPARK_PYTHON") is not None, "env PYSPARK_PYTHON must be defined" assert os.getenv("SPARK_HOME") is not None, "env SPARK_HOME must be defined" # Create a spark Context ScManager.create()
def test_collision_same_words(self): """ The words are all the same """ sc = ScManager.get() sax_result = SaxResult(paa=sc.parallelize([]), breakpoints=[], sax_word='abcdabcdabcdabcd') sax, _, _ = sax_result.start_sax(4, spark_ctx=sc) sequences_size = np.array(sax.collect()).shape[1] result, _ = final_collision_matrix(sax=sax, number_of_iterations=6, index_selected=2, word_len=sequences_size, spark_ctx=sc) result = result.data # all four words are identical => each of the binom(4, 2) = 6 word pairs collides at every one of the 6 iterations, so six cells hold the maximum value nb_cell = 0 for i in result: if i[0] == 6: nb_cell += 1 self.assertEqual(nb_cell, 6)
def test_sw_sax_limit_constant(self): """ Test sliding window and SAX on a constant timeseries containing two larger values """ sax_info = ConfigSax(paa=10, sequences_size=10, with_mean=True, with_std=True, global_norm=False, local_norm=False, linear_filter=False, recovery=0.5, coefficients=[0.1, 0.9], alphabet_size=5) spark_ctx = ScManager.get() result, _ = sliding_windows(ts_list=["specific_time_serie"], sax_info=sax_info, spark_ctx=spark_ctx) print("result={}".format(result.collect())) sax_result = run_sax_on_sequences(rdd_sequences_data=result, paa=sax_info.paa, alphabet_size=sax_info.alphabet_size) print("sax_word={}".format(sax_result.sax_word)) # PAA_value = 0 => 'c' # PAA_value = 10 => 'e' or 'd' # PAA_value = -10 => 'a' or 'b' self.assertIn(sax_result.sax_word, ['ccccccccae', 'ccccccccbd'])
def test_sliding_window_sax_basic(self): """ Test the nominal case """ sax_info = ConfigSax(paa=3, sequences_size=6, with_mean=True, with_std=True, global_norm=False, local_norm=False, linear_filter=False, recovery=0.5, coefficients=[0.1, 0.9], alphabet_size=3) spark_ctx = ScManager.get() result, _ = sliding_windows(ts_list=["linear_time_serie"], sax_info=sax_info, spark_ctx=spark_ctx) sax_result = run_sax_on_sequences(rdd_sequences_data=result, paa=sax_info.paa, alphabet_size=sax_info.alphabet_size) # recovery = 0.5 and word_size = 3 => sax_result = 'aab abc bcc' self.assertEqual(sax_result.sax_word, 'aababcbcc')
def test_coll_various_words(self): """ Test the collision matrix for same and different words The words 0 and 3 are the same, the words 1 and 2 too """ nb_paa = 5 nb_index = 2 sc = ScManager.get() sax_result = SaxResult(paa=sc.parallelize([]), breakpoints=[], sax_word=''.join( ['ababa', 'cdcdc', 'cdcdc', 'ababa'])) sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc) sequences_size = np.array(sax.collect()).shape[1] result, _ = final_collision_matrix(sax=sax, number_of_iterations=int( binom(nb_paa, nb_index)), index_selected=nb_index, word_len=sequences_size, spark_ctx=sc) result = result.data result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]), int(x[1][1]))) print(result) # the maximum of possible combinations without repetitions is 10 # two cells of 10 : one for the occurrences between the words 1 and 2, and another for the words 0 and 3 for i in range(2): self.assertTrue(result[i][0] == 10) self.assertTrue( int(result[0][1][0]) == 2 and int(result[0][1][1]) == 1) self.assertTrue( int(result[1][1][0]) == 3 and int(result[1][1][1]) == 0)
def _run_all_in_master_memory(self, method): """ Run the Spark Pearson correlation by loading all the TS content (i.e. values) into the master's memory Each coefficient will be computed by a worker (Spark decides the best distribution to apply) """ # Create or get a spark Context spark_context = ScManager.get() # Get TS content rdd_content = self._get_ts(spark_context) # Job distribution is made by Statistics.corr (Spark correlation matrix calculation) self.results = Statistics.corr(rdd_content, method=method) ScManager.stop()
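# A minimal, self-contained sketch of the Spark call used above, assuming only
# pyspark is available (spark_context would typically come from ScManager.get()).
# The RDD rows are observation vectors (one value per TS per timestamp) and
# Statistics.corr returns the correlation matrix; the values below are
# illustrative, not taken from the IKATS code.
def _example_statistics_corr_sketch(spark_context):
    """Illustrative only: Pearson correlation matrix on a tiny RDD."""
    from pyspark.mllib.stat import Statistics

    # Each element is one observation: [value_ts1, value_ts2, value_ts3]
    rdd_content = spark_context.parallelize([
        [1.0, 2.0, 3.0],
        [2.0, 4.0, 6.1],
        [3.0, 6.0, 8.9],
    ])
    # Spark distributes the pairwise coefficient computation
    return Statistics.corr(rdd_content, method="pearson")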
def _apply_motif_iter_zero_coll(self, activate_spark): """ Test - with the iterative method to search the neighborhood motif, - with/without spark jobs - and where the words are all different => no collisions """ spark_context = ScManager.get() # Build the SAX result with different words, and small breakpoints sax_result = SaxResult(paa=spark_context.parallelize([]), breakpoints=[-0.3, -0.1, 0.1, 0.3], sax_word='abcdebcdeacdeabdeabceabcd') sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context) # sax is an rdd -> to np.array sax = np.transpose(sax.collect()) breakpoint = sax_result.build_mindist_lookup_table(nb_seq) # Different words => only zero cells in the collision matrix collision_matrix = SparseMatrix(np.zeros((nb_seq, nb_seq))) # Build the class for motif search search_info = NeighborhoodSearch(size_sequence=20, mindist_lookup_table=breakpoint, alphabet_size=5, sax=np.transpose(sax), radius=1000, collision_matrix=collision_matrix) recognition_info = ConfigRecognition( is_stopped_by_eq9=True, iterations=100, min_value=1, is_algo_method_global=False, activate_spark=activate_spark, radius=1000, neighborhood_method=OPT_USING_BRUTE_FORCE) # neighborhood_method=OPT_USING_BRUTE_FORCE result = search_info.motif_neighborhood_iterative(30, recognition_info) # There are no similar sequences self.assertEqual(len(result), 0) # neighborhood_method=OPT_USING_COLLISIONS recognition_info.neighborhood_method = OPT_USING_COLLISIONS result = search_info.motif_neighborhood_iterative(30, recognition_info) # There are no similar sequences self.assertEqual(len(result), 0)
def __init__(self, tdm, ts_load_split_size=10): """ init the spark distance class :param tdm: the temporal data manager client :type tdm: TemporalDataMgr :param ts_load_split_size: size of TS packet to load from TDM :type ts_load_split_size: int """ self.tdm = tdm self.ts_load_split_size = ts_load_split_size self.spark_context = ScManager.get() self.logger = logging.getLogger(__name__)
def test_coll_near_same_words(self): """ Pairs of words share 1, 2, 3 or 4 letters, but no two words are exactly the same because each word has five letters """ nb_paa = 5 nb_index = 2 sc = ScManager.get() sax_result = SaxResult( paa=sc.parallelize([]), breakpoints=[], sax_word=''.join(['aaaaa', 'abbbb', 'abccc', 'abcdd', 'abcde'])) sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc) sequences_size = np.array(sax.collect()).shape[1] result, _ = final_collision_matrix(sax=sax, number_of_iterations=int( binom(nb_paa, nb_index)), index_selected=nb_index, word_len=sequences_size, spark_ctx=sc) # sorted result list result = result.data result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]), int(x[1][1]))) print(result) # expected sorted list: expected_result = [(1.0, (2, 1)), (1.0, (3, 1)), (3.0, (3, 2)), (1.0, (4, 1)), (3.0, (4, 2)), (6.0, (4, 3))] expected_result.sort(key=lambda x: "{}-{}-{}".format( int(x[0]), int(x[1][0]), int(x[1][1]))) self.assertEqual(len(result), len(expected_result)) for expected_item, res_item in zip(expected_result, result): self.assertEqual(expected_item[0], res_item[0], 'nb collisions') self.assertEqual(expected_item[1][0], res_item[1][0], 'seq index left-side') self.assertEqual(expected_item[1][1], res_item[1][1], 'seq index right-side')
def test_sliding_window_filter(self): """ Testing linear filter. """ sax_info = ConfigSax(paa=3, sequences_size=6, with_mean=True, with_std=True, global_norm=False, local_norm=False, linear_filter=True, recovery=0.5, coefficients=[1, 0.5], alphabet_size=6) spark_ctx = ScManager.get() # Test for linear sequences result, _ = sliding_windows(ts_list=["linear_time_serie"], sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() # all sequences are linear => they are all filtered out, no sequence remains self.assertEqual(len(result), 0) # Test for constant sequences with no recovery (recovery = 0 => no overlap between sequences) sax_info.coefficients = [0, 1] sax_info.recovery = 0 result, _ = sliding_windows(ts_list=["ts_with_constant_pattern"], sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() LOGGER.info("result=%s", result) LOGGER.info("ts_init=%s", get_ts_mock("ts_with_constant_pattern")) # TS of 12 pts, sequence size = 6, recovery = 0 (no recovery) -> 2 sequences self.assertEqual(len(result), 2)
def test_collision_different_words(self): """ The words are all different """ nb_paa = 5 nb_index = 2 sc = ScManager.get() sax_result = SaxResult( paa=sc.parallelize([]), breakpoints=[], sax_word=''.join(['abcde', 'fghij', 'klmno', 'pqrst', 'uvwxy'])) sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc) sequences_size = np.array(sax.collect()).shape[1] result, _ = final_collision_matrix(sax=sax, number_of_iterations=int( binom(nb_paa, nb_index)), index_selected=nb_index, word_len=sequences_size, spark_ctx=sc) result = result.data # completely different words => no non-zero cell in the sparse collision matrix self.assertEqual(len(result), 0)
def test_sax(self): """ Test where no PAA averaging is needed (4 PAA values for 4 points in a sequence) and the PAA values are equidistant """ sax_info = ConfigSax(paa=4, sequences_size=4, with_mean=True, with_std=True, global_norm=False, local_norm=False, linear_filter=False, recovery=0.5, coefficients=[0.1, 0.9], alphabet_size=4) spark_ctx = ScManager.get() result, _ = sliding_windows( ts_list=["simple_sequences_ts0", "simple_sequences_ts1"], sax_info=sax_info, spark_ctx=spark_ctx) LOGGER.info("sliding_windows done!") sax_result = run_sax_on_sequences(rdd_sequences_data=result, paa=sax_info.paa, alphabet_size=sax_info.alphabet_size) result = result.collect() LOGGER.info("sax_result=%s", sax_result) LOGGER.info("result=%s", result) # the PAA : [[4, 4, 0, 2], [-2, 2, -2, 0]] self.assertEqual(sax_result.paa.collect(), [4, 4, 0, 2, -2, 2, -2, 0]) # the expected result : 'ddbc acab' self.assertEqual(sax_result.sax_word, 'ddbcacab') # Test where the PAA values are actually computed (12 points averaged into 4 segments) sax_info = ConfigSax(paa=4, sequences_size=12, with_mean=True, with_std=True, global_norm=False, local_norm=False, linear_filter=False, recovery=0.5, coefficients=[0.1, 0.9], alphabet_size=4) result, _ = sliding_windows( ts_list=["sequences_1_ts0", "sequences_1_ts1"], sax_info=sax_info, spark_ctx=spark_ctx) sax_result = run_sax_on_sequences(rdd_sequences_data=result, paa=sax_info.paa, alphabet_size=sax_info.alphabet_size) # the PAA : [[1, 4, -2, 1], [4, -2, -3, -3]] self.assertEqual(sax_result.paa.collect(), [1, 4, -2, 1, 4, -2, -3, -3]) # the expected result : 'cdbc dbaa' self.assertEqual(sax_result.sax_word, 'cdbcdbaa')
def run_sax_from_ts_list(ts_list, alphabet_size, word_size, normalize=False, activate_spark=None): """ Perform the Symbolic Aggregate Approximation (SAX) on the TSUID list provided in **ts_list** Use spark if necessary .. note:: If spark fails. The local computation will be performed :param ts_list: tsuid list of the TS to calculate the PAA timeseries :type ts_list: list :param alphabet_size: number of characters in result word :type alphabet_size: int :param word_size: number of segments :type word_size: int :param activate_spark: True to force spark, False to force local, None to let the algorithm decide :type activate_spark: bool or none :param normalize: Apply the normalization of the TS if True (False:default) :type normalize: bool :return: A list of dict composed of the PAA result, the SAX breakpoints, the SAX string and the points for all TSUID :rtype: list """ results = {} # Define if spark is necessary if activate_spark is None: md = IkatsApi.md.read(ts_list=ts_list) sum_points = 0 for tsuid in md: if 'qual_nb_points' in md[tsuid]: sum_points += float(md[tsuid]['qual_nb_points']) else: # No information on number of points, consider using spark sum_points = 0 break spark_nb_points_trigger = 1E5 if sum_points == 0 or sum_points / len( ts_list) > spark_nb_points_trigger: # Spark is active if the average number of points per TS is greater than spark_nb_points_trigger points activate_spark = True if activate_spark: LOGGER.info("Running SAX using Spark") # Create or get a spark Context spark_context = ScManager.get() # Build the RDD with TSUIDS rdd = spark_context.parallelize(ts_list) # Create a broadcast for spark jobs broadcast = spark_context.broadcast({ "alphabet_size": alphabet_size, "word_size": word_size, "normalize": normalize, }) # Create an accumulator to store the results of the spark workers accumulator = spark_context.accumulator(dict(), ListAccumulatorParam()) def run_sax_spark(working_tsuid): """ Method called by spark job :param working_tsuid: rdd item """ results = run_sax_from_tsuid( tsuid=working_tsuid, alphabet_size=broadcast.value['alphabet_size'], word_size=broadcast.value['word_size'], normalize=broadcast.value['normalize']) accumulator.add({working_tsuid: results}) # Get TS content using spark distribution to increase performance # noinspection PyBroadException try: rdd.foreach(run_sax_spark) except Exception: LOGGER.warning( 'Something wrong with spark, Using Local Computation') activate_spark = False for ts in ts_list: if ts in accumulator.value: results[ts] = accumulator.value[ts] else: LOGGER.warning( "TS %s has encountered an issue during the spark distribution", ts) ScManager.stop() if not activate_spark: LOGGER.info("Running SAX on single instance") for ts in ts_list: results[ts] = run_sax_from_tsuid(tsuid=ts, alphabet_size=alphabet_size, word_size=word_size, normalize=normalize) # print("TS=%s\nnorm=%s\nr=%s\n\n"%(ts,normalize,results[ts]['sax_breakpoints'][0])) return results
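# Hedged usage sketch for run_sax_from_ts_list() defined above. The TSUIDs are
# placeholders, and a configured IKATS/Spark environment (SPARK_HOME,
# PYSPARK_PYTHON, reachable metadata backend) is assumed.
def _example_run_sax_from_ts_list():
    """Illustrative only: SAX on two hypothetical TSUIDs, forcing local computation."""
    hypothetical_ts_list = ["0000110000030003F3", "0000110000030003F4"]  # placeholder TSUIDs
    results = run_sax_from_ts_list(ts_list=hypothetical_ts_list,
                                   alphabet_size=4,       # letters 'a' to 'd'
                                   word_size=8,           # 8 PAA segments per TS
                                   normalize=True,
                                   activate_spark=False)  # force local computation
    # results maps each TSUID to its SAX output (PAA values, breakpoints, SAX string, points)
    return results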
def test_sliding_window_norm(self): """ Testing global and local norm. """ epsilon = 1.0e-10 # recovery = 0 (no recovery) -> 3 seq of 4 points (nb_points = 12) sax_info = ConfigSax(paa=3, sequences_size=4, with_mean=True, with_std=True, global_norm=True, local_norm=False, linear_filter=False, recovery=0, coefficients=[0.1, 1], alphabet_size=6) spark_ctx = ScManager.get() # Test with global normalization : the timeseries is normalized result, coeff = sliding_windows(ts_list=["linear_time_serie"], sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() coeff = coeff.collect() # Check coeff : coeff is the mean and variance of each sequence # 12 points, no recovery (recovery=0) -> 3 seq of 4 points self.assertEqual(len(coeff), 3) # ts_value is an array with the sequences values ts_value = np.array([]) for i, _ in enumerate(result): # result[i] = (key, list([timestamps, values],[,],...)) ts_value = np.concatenate((result[i][1][:, 1], ts_value)) LOGGER.info("result=%s", result) # no recovery => 3 seq * 4 points = 12 values = nb_points self.assertEqual(len(ts_value), 12) LOGGER.info("ts_std=%s", (ts_value.std())) LOGGER.info("ts_mean=%s", np.mean(ts_value)) # global normalization => ts_value has a standard deviation of 1 and a mean of 0 self.assertTrue(1 - epsilon < np.std(ts_value) < 1 + epsilon) self.assertTrue(-epsilon < np.mean(ts_value) < epsilon) # Test with local normalization : all the sequences are normalized sax_info.global_norm = False sax_info.local_norm = True sax_info.linear_filter = True # Recovery = 1 : maximum recovery sax_info.recovery = 1 result, coeff = sliding_windows(ts_list=["ts_with_constant_pattern"], sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() # Verify that each sequence is normalized for i, _ in enumerate(result): # result[i] = (key, list([timestamps, values],[,],...)) seq_value = result[i][1][:, 1] self.assertTrue(1 - epsilon < np.std(seq_value) < 1 + epsilon) self.assertTrue(-epsilon < np.mean(seq_value) < epsilon)
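# Minimal numpy sketch of the z-normalization property checked by the test above
# (mean 0 and standard deviation 1, within epsilon). Purely illustrative and
# independent of the sliding_windows() implementation.
def _example_z_norm_property():
    import numpy as np
    values = np.array([1.0, 2.0, 3.0, 4.0])
    normalized = (values - values.mean()) / values.std()
    # the normalized values have mean 0 and standard deviation 1
    assert abs(normalized.mean()) < 1.0e-10
    assert abs(normalized.std() - 1.0) < 1.0e-10
    return normalized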
def cut_ds_from_metric(ds_name, metric, criteria, group_by=None, fid_pattern=None, chunk_size=75000): """ Entry point of the method that cut a dataset based on the criteria applied to the TS matching the metric The criteria expression is a python expression that will be converted to a lambda expression with 'M' used as metric value. Example: "M > 7 and M not in [1,2,6]" :param ds_name: name of the dataset to use :param metric: metric used as reference to find cut ranges :param criteria: criteria expression describing the value thresholds. :param group_by: metadata to iterate on each value (Default to None to not use this behaviour) :param fid_pattern: name of the generated TS. Variables can be used: - {fid} : Functional identifier - {M} : metric :param chunk_size: Size of the ideal chunk (in number of points per chunk) :type ds_name: str :type metric: str :type criteria: str :type group_by: str or None :type fid_pattern: str :type chunk_size: int :return: the ts list of the generated TS. [{"funcId": "xx", "tsuid":"xx"}] :rtype: list :raises ValueError: if dataset is empty :raises ValueError: if metric is found several times in dataset :raises ValueError: if metric is not found in dataset :raises ValueError: if group_by doesn't have a matching reference :raises KeyError: if error in fid_pattern """ # List of TS present in dataset ts_list = IkatsApi.ds.read(ds_name=ds_name)['ts_list'] if len(ts_list) == 0: LOGGER.error("Dataset %s is empty", ds_name) raise ValueError("Dataset %s is empty" % ds_name) # Get all the metadata md_list = IkatsApi.md.read(ts_list=ts_list) # List of all possible values encountered for the group by groups_list = None if group_by not in [None, ""]: # Get all the groups for this group by criterion groups_list = _find_all_groups(group_by, md_list) LOGGER.info("%s groups found for [%s]", len(groups_list), group_by) else: # Force to None group_by = None # Find the reference TS and all TS to cut using this ref grouped_ts_list = _find_ts_ref_group(ds_name=ds_name, md_list=md_list, metric=metric, ts_list=ts_list, group_by=group_by, group_by_list=groups_list) # Get Spark Context # Important !!!! Use only this method in Ikats to use a spark context spark_context = ScManager.get() try: result = [] # For each group (processed in alphabetic order) for group in sorted(grouped_ts_list): result_iter = _cut_from_metric_for_group( chunk_size=chunk_size, criteria=criteria, ds_name=ds_name, fid_pattern=fid_pattern, md_list=md_list, metric=metric, spark_context=spark_context, group=grouped_ts_list[group]) # Sort functional identifiers alphabetically) result.extend(sorted(result_iter, key=lambda x: x['funcId'])) return result finally: ScManager.stop()
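# Hedged usage sketch for cut_ds_from_metric() defined above. The dataset name
# and metric are placeholders; the criteria string follows the documented
# convention where 'M' stands for the metric value.
def _example_cut_ds_from_metric():
    """Illustrative only: cut a hypothetical dataset where the reference metric exceeds 7."""
    ts_list = cut_ds_from_metric(ds_name="MY_DATASET",          # placeholder dataset
                                 metric="WS1",                  # placeholder metric
                                 criteria="M > 7 and M not in [1, 2, 6]",
                                 group_by=None,
                                 fid_pattern="{fid}_cut_{M}",
                                 chunk_size=75000)
    # ts_list is the list of generated TS: [{"funcId": ..., "tsuid": ...}, ...]
    return ts_list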
def compute_slope(ts_list, fid_suffix="_slope", chunk_size=75000, save_new_ts=True): """ Compute the slope of a list of timeseries using spark This implementation computes slope for one TS at a time in a loop. To know the details of the computation, see the corresponding method :param ts_list: list of TS. Each item is a dict composed of a TSUID and a functional id :param fid_suffix: Functional identifier suffix of the final timeseries :param chunk_size: Number of points per chunk (assuming the TS is periodic) :param save_new_ts: True (default) if TS must be saved to database :type ts_list: list of dict :type fid_suffix: str :type chunk_size: int :type save_new_ts: bool :return: the new list of derived TS (same order as input) :rtype: list of dict :raise TypeError: if ts_list type is incompatible """ # Check inputs if not isinstance(ts_list, list): raise TypeError("ts_list shall be a list") if len(ts_list) == 0: raise TypeError("ts_list must have at least one element") LOGGER.info('Computing Slope for %s TS', len(ts_list)) tsuid_list = ts_list try: # Extract TSUID from ts_list tsuid_list = [x['tsuid'] for x in ts_list] except Exception: # Already a tsuid_list. # Getting the functional id for each ts ts_list = [{ 'tsuid': x, 'funcId': IkatsApi.fid.read(x) } for x in ts_list] # Gather all metadata for the list of TS to compute slope md_list = IkatsApi.md.read(tsuid_list) # Results will be stored here results = [] try: # Get Spark Context spark_context = ScManager.get() for index, tsuid in enumerate(tsuid_list): fid = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0] LOGGER.info('Processing Slope for TS %s (%s/%s) (%s)', fid, (index + 1), len(tsuid_list), tsuid) computed_tsuid, computed_fid = compute_slope_for_tsuid( spark_context=spark_context, fid=fid, fid_suffix=fid_suffix, tsuid=tsuid, md_list=md_list, chunk_size=chunk_size, save_new_ts=save_new_ts) # Append results to final results results.append({"tsuid": computed_tsuid, "funcId": computed_fid}) except Exception: raise finally: # Stop spark context in all cases ScManager.stop() return results
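# Hedged usage sketch for compute_slope() defined above; the TSUID/funcId pair
# is a placeholder and a running IKATS backend plus Spark environment is assumed.
def _example_compute_slope():
    """Illustrative only: derive the slope TS without saving them to the database."""
    input_ts = [{"tsuid": "0000110000030003F3", "funcId": "WS1_flight1"}]  # placeholder
    derived = compute_slope(ts_list=input_ts,
                            fid_suffix="_slope",
                            chunk_size=75000,
                            save_new_ts=False)  # keep the derived TS out of the database
    # derived keeps the input order; each item holds the new tsuid and funcId
    # (typically the original funcId followed by fid_suffix)
    return derived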
def _resample(resampling_way, ts_list, resampling_period, adding_method=AddingMethod.LINEAR_INTERPOLATION, timestamp_position=TimestampPosition.BEG, aggregation_method=AggregationMethod.AVG, nb_points_by_chunk=50000, generate_metadata=False): """ Function that effectively resamples (UP or DOWN according to resampling_way value) using Spark :param resampling_way: way of resampling (UP or DOWN) :type ts_list: ResamplingWay :param ts_list: list composing the TS information to resample [{'tsuid': xxx, 'funcId': yyy },...] :type ts_list: list of dict :param resampling_period: target period for resampling (in ms) :type resampling_period: int :param adding_method: Method to use for interpolation (see type AddingMethod for more information) :type adding_method: AddingMethod or str or int :param timestamp_position: timestamp position in the interval while downsampling :type timestamp_position: str ('BEG','MID','END') :param aggregation_method: aggregation method for downsampling :type aggregation_method: str ('MIN','MAX','MED','AVG','FIRST','LAST') :param nb_points_by_chunk: user defined number of points used for a spark chunk of data (after resampling) :type nb_points_by_chunk: int :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points) :type generate_metadata: boolean (default : False) :returns: a list of dict [{'tsuid': xxx, 'funcId': yyy },...] :rtype: list of dict """ if ts_list == []: return [] fid_dict = dict() for ts in ts_list: fid_dict[ts['funcId']] = ts['tsuid'] # List of chunks of data and associated information to parallelize with Spark data_to_compute = [] # Extract tsuid list from inputs tsuid_list = [x["tsuid"] for x in ts_list] # Checking metadata availability before starting resampling meta_list = IkatsApi.md.read(tsuid_list) # Collecting information from metadata for tsuid in tsuid_list: if tsuid not in meta_list: LOGGER.error("Timeseries %s : no metadata found in base", tsuid) raise ValueError("No ikats metadata available for resampling %s" % tsuid) if 'ikats_start_date' not in meta_list[tsuid]: # Metadata not found LOGGER.error( "Metadata 'ikats_start_date' for timeseries %s not found in base", tsuid) raise ValueError("No start date available for resampling [%s]" % tsuid) if 'ikats_end_date' not in meta_list[tsuid]: # Metadata not found LOGGER.error( "meta data 'ikats_end_date' for timeseries %s not found in base", tsuid) raise ValueError("No end date available for resampling [%s]" % tsuid) if 'qual_ref_period' not in meta_list[tsuid]: # Metadata not found LOGGER.error( "Metadata qual_ref_period' for timeseries %s not found in base", tsuid) raise ValueError( "No reference period available for resampling [%s]" % tsuid) # Original timeseries information retrieved from metadata sd = int(meta_list[tsuid]['ikats_start_date']) ed = int(meta_list[tsuid]['ikats_end_date']) ref_period = int(float(meta_list[tsuid]['qual_ref_period'])) # Get the functional identifier of the original timeseries fid_origin = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0] # Generate functional id for resulting timeseries if resampling_way == ResamplingWay.UP_SAMPLING: func_id = "%s_resampled_to_%sms_%s" % ( fid_origin, str(resampling_period), str(adding_method)) else: func_id = "%s_resampled_to_%sms_%s_%s" % ( fid_origin, str(resampling_period), timestamp_position, aggregation_method) # Creating new reference in database for new timeseries IkatsApi.ts.create_ref(func_id) # Prepare data to compute by defining intervals of final size 
nb_points_by_chunk # Chunk intervals computation : # Computing elementary size which is the lowest common multiple between ref period and resampling period elementary_size = _lowest_common_multiple(ref_period, resampling_period) # Seeking the number of elementary size which contains nb of points nearest to nb_points_by_chunk parameter # in order to compute the final data chunk size nb_points_for_elementary_size = int(elementary_size / resampling_period) data_chunk_size = int(nb_points_by_chunk / nb_points_for_elementary_size) * elementary_size # Limit the size of data_chunk_size if data_chunk_size < elementary_size: data_chunk_size = elementary_size # Computing intervals for chunk definition interval_limits = np.hstack((np.arange(sd, ed, data_chunk_size, dtype=np.int64), ed)) # from intervals we define chunk of data to compute # ex : intervals = [ 1, 2, 3] => 2 chunks [1, 2] and [2, 3] if len(interval_limits) > 2: # there is more than 2 limits for interval definition, i.e there is more than one chunk to compute data_to_compute.extend([(tsuid, func_id, i, interval_limits[i], interval_limits[i + 1]) for i in range(len(interval_limits) - 1)]) elif len(interval_limits) > 1: # only one chunk to compute data_to_compute.append( (tsuid, func_id, 0, interval_limits[0], interval_limits[1])) # in case last original point and last downsampled point are aligned => add a supplementary chunk to compute # last point if (interval_limits[-1] - sd) % resampling_period == 0: data_to_compute.append((tsuid, func_id, 1, interval_limits[-1], interval_limits[-1] + resampling_period)) LOGGER.info("Running resampling using Spark") # Create or get a spark Context spark_context = ScManager.get() if resampling_way == ResamplingWay.UP_SAMPLING: spark_function = _spark_upsample args = adding_method else: spark_function = _spark_downsample args = (timestamp_position, aggregation_method) try: # OUTPUT : [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...] inputs = spark_context.parallelize(data_to_compute, len(data_to_compute)) # INPUT : [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...] # OUTPUT : [((TSUID_origin, func_id), chunk_index, original_data_array), ...] # PROCESS : read original data in database / filter chunk with no data rdd_data_with_chunk_index = inputs \ .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \ .filter(lambda x: len(x[2]) > 0) if resampling_way == ResamplingWay.UP_SAMPLING: # INPUT : [((TSUID_origin, func_id), chunk_index, original_data_array), ...] # OUTPUT : [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...] # PROCESS : compute inter-chunks intervals / filter empty chunks rdd_data = _calc_inter_chunks(rdd=rdd_data_with_chunk_index) \ .map(lambda x: (x[0], x[2])) \ .filter(lambda x: not (len(x[1]) == 2 and (int(float(x[1][0][0])) == int(float(x[1][1][0]))))) else: # INPUT : [((TSUID_origin, func_id), chunk_index, original_data_array), ...] # OUTPUT : [((TSUID_origin, func_id), original_data_array), ...] # PROCESS : suppress useless chunk indexes rdd_data = rdd_data_with_chunk_index.map(lambda x: (x[0], x[2])) # INPUT : [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...] # OUTPUT : [((TSUID_origin, func_id), data_resampled_array), ...] 
# PROCESS : resample chunks of data to resampling_period rdd_resampled_data = rdd_data.map( lambda x: (x[0], spark_function(data=x[1], period=resampling_period, args=args))) \ .filter(lambda x: len(x[1]) > 0) # INPUT : [((TSUID_origin, func_id), data_resampled_array), ...] # OUTPUT : [(TSUID_origin, func_id, TSUID, sd, ed), ...] # PROCESS : create resampled data in database / compute global start and end date identifiers = rdd_resampled_data \ .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1], data=x[1], generate_metadata=generate_metadata))) \ .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \ .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \ .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \ .collect() except Exception as err: msg = "Exception raised while resampling with Spark: %s " % err LOGGER.error(msg) raise IkatsException(msg) finally: # Stop spark Context ScManager.stop( ) # Post-processing : metadata import and return dict building # returns dict containing the results of the resampling # where key is the original TSUID and values are resampled TSUID and functional identifiers returned_dict = {} for timeseries in identifiers: tsuid_origin = timeseries[0] func_id = timeseries[1] tsuid = timeseries[2] sd = timeseries[3] ed = timeseries[4] # Import metadata in non temporal database _save_metadata(tsuid=tsuid, md_name='qual_ref_period', md_value=resampling_period, data_type=DTYPE.number, force_update=True) _save_metadata(tsuid=tsuid, md_name='ikats_start_date', md_value=sd, data_type=DTYPE.date, force_update=True) _save_metadata(tsuid=tsuid, md_name='ikats_end_date', md_value=ed, data_type=DTYPE.date, force_update=True) # Retrieve imported number of points from database qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid) IkatsApi.md.create(tsuid=tsuid, name='qual_nb_points', value=qual_nb_points, data_type=DTYPE.number, force_update=True) # Inherit from parent IkatsApi.ts.inherit(tsuid, tsuid_origin) # Fill returned list returned_dict[tsuid_origin] = {"tsuid": tsuid, 'funcId': func_id} return returned_dict
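# Hedged usage sketch for the private _resample() helper above. A public
# up/downsampling wrapper normally drives it; here it is called directly for
# illustration. The TS reference is a placeholder and the enum member
# ResamplingWay.DOWN_SAMPLING is an assumption (only UP_SAMPLING appears above).
def _example_resample_downsampling():
    """Illustrative only: downsample to a 10 s period, averaging each interval."""
    ts_list = [{"tsuid": "0000110000030003F3", "funcId": "WS1_flight1"}]  # placeholder
    resampled = _resample(resampling_way=ResamplingWay.DOWN_SAMPLING,  # assumed enum member
                          ts_list=ts_list,
                          resampling_period=10000,  # target period in ms
                          timestamp_position=TimestampPosition.BEG,
                          aggregation_method=AggregationMethod.AVG,
                          nb_points_by_chunk=50000)
    # resampled maps each original TSUID to {"tsuid": new_tsuid, "funcId": new_fid}
    return resampled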
def _apply_motif_global_same_words(self, activate_spark): """ Test - with the global method to search the neighborhood motif, - with/without spark jobs according to activate_spark - and where the words are all the same """ spark_context = ScManager.get() # Build the SAX result with large breakpoints sax_result = SaxResult(paa=spark_context.parallelize([]), breakpoints=[-300, -100, 100, 300], sax_word='abcdeabcdeabcdeabcde') sax, _, _ = sax_result.start_sax(5, spark_ctx=spark_context) # sax is an rdd -> to np.array sax = np.transpose(sax.collect()) breakpoint = sax_result.build_mindist_lookup_table(alphabet_size=5) # Build the collision matrix result collision_matrix = SparseMatrix( np.array([[ 0, 0, 0, 0, ], [ 100, 0, 0, 0, ], [ 100, 100, 0, 0, ], [ 100, 100, 100, 0, ]])) # two identical cases here: brute force / with collisions for method_opt in [OPT_USING_BRUTE_FORCE, OPT_USING_COLLISIONS]: # mindist distances: # # [[ 0. 0. 0. 0.] # [ 0. 0. 0. 0.] # [ 0. 0. 0. 0.] # [ 0. 0. 0. 0.]] # Build the class for motif search search_info = NeighborhoodSearch(size_sequence=20, mindist_lookup_table=breakpoint, alphabet_size=5, sax=np.transpose(sax), radius=0.01, collision_matrix=collision_matrix) recognition_info = ConfigRecognition( is_stopped_by_eq9=True, iterations=0, min_value=1, is_algo_method_global=True, activate_spark=activate_spark, radius=0.01, neighborhood_method=method_opt) # neighborhood_method=OPT_USING_BRUTE_FORCE (compare with all the words) result = search_info.motif_neighborhood_global( 30, recognition_info) self._print_mindist_mat(search_info) # The words corresponding to the six largest values cells have a MINDIST < radius self.assertEqual(len(result), 1) # This results are the same : [0,1,2,3]: the 6 groups have been reduced to one inside self.assertEqual(result, [[0, 1, 2, 3]])
def dataset_cut_spark(tsuid_list, start, end, nb_points, nb_points_by_chunk, generate_metadata, meta_list): """ Cutting dataset algorithm, using spark :param tsuid_list: list of tsuid :param start: start cut date :param end: end cut date :param nb_points: number of points to cut :param nb_points_by_chunk: number of points per chunk :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points) (default: False) :param meta_list: dict of metadata (tsuid is the key) :type tsuid_list: list :type start: int :type end: int or None :type nb_points: int or None :type generate_metadata: boolean :param meta_list: dict :return: list of dict {"tsuid": tsuid, "funcId": func_id} :rtype: list of dict :raise ValueError: if inputs are not filled properly (see called methods description) """ # List of chunks of data and associated information to parallelize with Spark data_to_compute = [] # Collecting information from metadata for tsuid in tsuid_list: if tsuid not in meta_list: LOGGER.error("Time series %s: no metadata found in base", tsuid) raise ValueError("No ikats metadata available for cutting %s" % tsuid) if 'ikats_start_date' not in meta_list[tsuid]: # Metadata not found LOGGER.error("Metadata 'ikats_start_date' for time series %s not found in base", tsuid) raise ValueError("No start date available for cutting [%s]" % tsuid) if 'ikats_end_date' not in meta_list[tsuid]: # Metadata not found LOGGER.error("Metadata 'ikats_end_date' for time series %s not found in base", tsuid) raise ValueError("No end date available for cutting [%s]" % tsuid) if 'qual_ref_period' not in meta_list[tsuid]: # Metadata not found LOGGER.error("Metadata 'qual_ref_period' for time series %s not found in base", tsuid) raise ValueError("No reference period available for cutting [%s]" % tsuid) # Original time series information retrieved from metadata sd = int(meta_list[tsuid]['ikats_start_date']) ed = int(meta_list[tsuid]['ikats_end_date']) ref_period = int(float(meta_list[tsuid]['qual_ref_period'])) # Get the functional identifier of the original time series fid_origin = IkatsApi.ts.fid(tsuid) # Generate functional id for resulting time series func_id = "%s_cut_%d" % (fid_origin, time.time() * 1e6) # Creating new reference in database for new time series IkatsApi.ts.create_ref(func_id) # Prepare data to compute by defining intervals of final size nb_points_by_chunk # Chunk intervals computation: data_chunk_size = int(nb_points_by_chunk * ref_period) # Computing intervals for chunk definition interval_limits = np.hstack(np.arange(sd, ed, data_chunk_size, dtype=np.int64)) # from intervals we define chunk of data to compute: # # 1. defining chunks excluding last point of data within every chunk # ex: intervals = [ 10, 20, 30, 40 ] => 2 chunks [10, 19] and [20, 29] (last chunk added in step 2) data_to_compute.extend([(tsuid, func_id, i, interval_limits[i], interval_limits[i + 1] - 1) for i in range(len(interval_limits) - 1)]) # 2. adding last interval, including last point of data # ex: [30, 40] data_to_compute.append((tsuid, func_id, len(interval_limits) - 1, interval_limits[-1], ed + 1)) LOGGER.info("Running dataset cut using Spark") # Create or get a spark Context spark_context = ScManager.get() try: # OUTPUT: [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...] inputs = spark_context.parallelize(data_to_compute, len(data_to_compute)) # INPUT: [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...] 
# OUTPUT: [((TSUID_origin, func_id), chunk_index, original_data_array), ...] # PROCESS: read original data in database / filter chunk with no data rdd_data = inputs \ .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \ .filter(lambda x: len(x[2]) > 0) # INPUT: [((TSUID_origin, func_id), chunk_index, original_data_array), ...] # OUTPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...] # PROCESS: cut chunks of data, filter empty results rdd_cut_chunk_data = rdd_data \ .map(lambda x: (x[0], x[1], _spark_cut(data=x[2], min_date=start, max_date=end))) \ .filter(lambda x: len(x[2][1]) > 0) \ .cache() # no end cutting date provided => case of cutting a given number of points if end is None: # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...] # OUTPUT: [((TSUID_origin, func_id), [(chunk_index1, nb_points1), (chunk_index2, nb_points2),...], ...] # PROCESS: Collect nb points associated to chunk indexes ts_pts_by_chunk = rdd_cut_chunk_data.map(lambda x: (x[0], (x[1], x[2][0]))) \ .groupByKey().map(lambda x: (x[0], list(x[1]))) \ .collect() # Compute for each ts from collected data: # - last chunk index containing points to keep # - the number of points to keep in this last chunk # cut_info: {(TSUID_origin1, func_id1):(last_chunk_index1, nb_points1), # (TSUID_origin2, func_id2):(last_chunk_index2, nb_points2), ...} cut_info = {} for ts in ts_pts_by_chunk: nb_cumul = 0 for chunk_index, points in ts[1]: nb_cumul += points # noinspection PyTypeChecker if nb_cumul > nb_points: # noinspection PyTypeChecker cut_info[ts[0]] = (chunk_index, points - (nb_cumul - nb_points)) break else: LOGGER.warning( "Number of points cut with start cutting date provided exceeds time series %s size" % IkatsApi.ts.fid(ts[0][0])) # case nb_points > nb points of the time series # noinspection PyTypeChecker cut_info[ts[0]] = (chunk_index, points) # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...] # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...] rdd_cut_data = rdd_cut_chunk_data.filter(lambda x: x[1] <= cut_info[x[0]][0]) \ .map(lambda x: (x[0], x[2][1][:cut_info[x[0]][1]] if x[1] == cut_info[x[0]][0] else x[2][1])) else: # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...] # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...] rdd_cut_data = rdd_cut_chunk_data.map(lambda x: (x[0], x[2][1])) # INPUT: [((TSUID_origin, func_id), data_cut_array), ...] # OUTPUT: [(TSUID_origin, func_id, TSUID, sd, ed), ...] 
# PROCESS: create cut data in database / compute global start and end date identifiers = rdd_cut_data \ .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1], data=x[1], generate_metadata=generate_metadata))) \ .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \ .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \ .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \ .collect() except Exception as err: msg = "Exception raised while cutting with Spark: %s " % err LOGGER.error(msg) raise IkatsException(msg) finally: # Stop spark Context ScManager.stop() # Post-processing: metadata import and return dict building # Returns list of dict containing the results of the cut time series: TSUID and functional identifiers results = [] for timeseries in identifiers: tsuid_origin = timeseries[0] func_id = timeseries[1] tsuid = timeseries[2] sd = timeseries[3] ed = timeseries[4] # Import metadata in non temporal database _save_metadata(tsuid=tsuid, md_name='ikats_start_date', md_value=sd, data_type=DTYPE.date, force_update=True) _save_metadata(tsuid=tsuid, md_name='ikats_end_date', md_value=ed, data_type=DTYPE.date, force_update=True) # Retrieve imported number of points from database qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid) IkatsApi.md.create(tsuid=tsuid, name='qual_nb_points', value=qual_nb_points, data_type=DTYPE.number, force_update=True) # Inherit from parent IkatsApi.ts.inherit(tsuid, tsuid_origin) # Fill returned list results.append({"tsuid": tsuid, "funcId": func_id}) return results
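# Hedged usage sketch for dataset_cut_spark() defined above; the TSUID and the
# cut dates are placeholders, and the metadata dict is fetched through
# IkatsApi.md.read() as the callers of this function do.
def _example_dataset_cut_spark():
    """Illustrative only: cut one TS between two dates (end date provided)."""
    tsuid_list = ["0000110000030003F3"]  # placeholder TSUID
    meta_list = IkatsApi.md.read(ts_list=tsuid_list)
    return dataset_cut_spark(tsuid_list=tsuid_list,
                             start=1449755790000,      # cut start date in ms (placeholder)
                             end=1449755800000,        # cut end date in ms (placeholder)
                             nb_points=None,           # unused when an end date is given
                             nb_points_by_chunk=75000,
                             generate_metadata=False,
                             meta_list=meta_list)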
def cut_y(original_ts_list, criterion, fid_pattern="{fid}_cutY{compl}", chunk_size=75000): """ Algorithm Cut-Y Cut among Y-axis (values) a list of timeseries matching a criterion defined as a python expression. Matching and non-matching values are separated into 2 timeseries This algorithm uses spark From the TS list provided (used as reference), extract 2 TS list: * The first one matching the value condition * The second one not matching the value condition :param original_ts_list: List of TSUID/funcID to use for filtering: [{tsuid:xxx, funcId:xxx}, ...] :param criterion: python expression used to define a matching pattern :param fid_pattern: pattern used to name the FID of the output TSUID. {fid} will be replaced by the FID of the original TSUID FID {M} will be replaced by the original TSUID metric name {compl} will be replaced by "" or "_compl" depending on the output type (matching/not matching). :param chunk_size: the number of points per chunk :type original_ts_list: list :type criterion: str :type fid_pattern: str :type chunk_size: int :return: 2 lists representing the "matching" and "not matching" list of TS corresponding to the input :rtype: list :raises ValueError: if ts_list is badly formatted :raises TypeError: if ts_list is not a list """ # Check input validity if type(original_ts_list) is not list: raise TypeError("ts_list shall be a list") if len(original_ts_list) == 0: raise ValueError("ts_list shall have at least one element") for _, item in enumerate(original_ts_list): if "tsuid" not in item or "funcId" not in item: raise ValueError("ts_list shall have tsuid and funcId defined") # Get all the metadata md_list = IkatsApi.md.read(ts_list=[x['tsuid'] for x in original_ts_list]) # Prepare the spark items to parallelize # Create and build the data that will be used in spark transformations ts_list_with_new_fid, fid2tsuid = _prepare_spark_data(fid_pattern=fid_pattern, md_list=md_list, ts_list=original_ts_list) # Chunks computation ts_info = [] for ts_data in ts_list_with_new_fid: # Get the chunks raw information chunks = SparkUtils.get_chunks(tsuid=ts_data[0], md_list=md_list, chunk_size=chunk_size) # Build a new list containing only used information for chunk in chunks: ts_info.append({ "tsuid": ts_data[0], "start_date": chunk[1], "end_date": chunk[2], "matching_fid": ts_data[1], "not_matching_fid": ts_data[2], "matching_tsuid": fid2tsuid[ts_data[1]], "not_matching_tsuid": fid2tsuid[ts_data[2]] }) # Get Spark Context # Important !!!! Use only this method in Ikats to use a spark context spark_context = ScManager.get() try: # Prepare the lambda expression. Value is replaced by "Y" variable name lambda_criterion = eval("lambda Y : " + criterion) # OUTPUT : [{ # tsuid:x, # start_date:x, # end_date:x, # matching_fid:x, # not_matching_fid:x, # matching_tsuid:x, # not_matching_tsuid:x # }, ...] # PROCESS : Parallelize TS chunks information rdd_ts_list = spark_context.parallelize(ts_info, max(8, len(ts_info))) # INPUT : [{ # tsuid:x, # start_date:x, # end_date:x, # matching_fid:x, # not_matching_fid:x, # matching_tsuid:x, # not_matching_tsuid:x # }, ...] 
# OUTPUT : [({ # start_date: "date of the first point matching the criterion in the current chunk" # end_date: "date of the last point matching the criterion in the current chunk" # numberOfSuccess: "number of points matching the criterion in the current chunk" # tsuid: "TSUID of the matching part" # }, # { # start_date: "date of the first point not matching the criterion in the current chunk" # end_date: "date of the last point not matching the criterion in the current chunk" # numberOfSuccess: "number of points not matching the criterion in the current chunk" # tsuid: "TSUID of the non-matching part" # }), ...] # PROCESS : Separate points matching and not-matching the criterion in every chunk. Fill the corresponding TS rdd_imported = rdd_ts_list.map(lambda x: _spark_cut_y_chunk( tsuid=x['tsuid'], start_date=x['start_date'], end_date=x['end_date'], match_criterion=lambda_criterion, result_info={ "matching_fid": x['matching_fid'], "not_matching_fid": x['not_matching_fid'], "matching_tsuid": x['matching_tsuid'], "not_matching_tsuid": x['not_matching_tsuid'] })) # INPUT : [({ # start_date: "date of the first point matching the criterion in the current chunk" # end_date: "date of the last point matching the criterion in the current chunk" # numberOfSuccess: "number of points matching the criterion in the current chunk" # tsuid: "TSUID of the matching part" # }, # { # start_date: "date of the first point not matching the criterion in the current chunk" # end_date: "date of the last point not matching the criterion in the current chunk" # numberOfSuccess: "number of points not matching the criterion in the current chunk" # tsuid: "TSUID of the non-matching part" # }), ...] # OUTPUT : [(TSUID, nb_points, start_date, end_date), ...] # PROCESS : Flat the results and simplify the format to allow quick actions on every item rdd_metadata_prep = rdd_imported \ .flatMap(lambda x: x) \ .filter(lambda x: x is not None) \ .map(lambda x: (x['tsuid'], x['numberOfSuccess'], x['start_date'], x['end_date'])) # Delete empty TSUID deleted_tsuid = rdd_metadata_prep \ .map(lambda x: (x[0], x[1])) \ .reduceByKey(lambda x, y: x + y) \ .filter(lambda x: x[1] == 0) \ .map(lambda x: (x[0], IkatsApi.ts.delete(tsuid=x[0]))) \ .map(lambda x: x[0]) \ .collect() # This RDD is reused in several branches. 
Caching it improves the performances rdd_metadata_prep.cache() # Create metadata qual_nb_points rdd_metadata_prep \ .map(lambda x: (x[0], x[1])) \ .reduceByKey(lambda x, y: x + y) \ .filter(lambda x: x[1] > 0) \ .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="qual_nb_points", value=x[1])) # Create metadata ikats_start_date rdd_metadata_prep \ .map(lambda x: (x[0], x[2])) \ .filter(lambda x: x[1] is not None) \ .reduceByKey(lambda x, y: min(x, y)) \ .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_start_date", value=x[1])) # Create metadata ikats_end_date rdd_metadata_prep \ .map(lambda x: (x[0], x[3])) \ .filter(lambda x: x[1] is not None) \ .reduceByKey(lambda x, y: max(x, y)) \ .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_end_date", value=x[1])) # Unpersist the RDD because not used anymore rdd_metadata_prep.unpersist() finally: ScManager.stop() # Inherit properties for item in ts_list_with_new_fid: if fid2tsuid[item[1]] not in deleted_tsuid: IkatsApi.ts.inherit(tsuid=fid2tsuid[item[1]], parent=item[0]) if fid2tsuid[item[2]] not in deleted_tsuid: IkatsApi.ts.inherit(tsuid=fid2tsuid[item[2]], parent=item[0]) # Format and sort the results # First output contains the matched data points TS reference # Second output contains the not matched (complement) points TS reference return (_format_output(deleted_tsuid=deleted_tsuid, fid2tsuid=fid2tsuid, ts_list_with_new_fid=ts_list_with_new_fid, index=1), _format_output(deleted_tsuid=deleted_tsuid, fid2tsuid=fid2tsuid, ts_list_with_new_fid=ts_list_with_new_fid, index=2))
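# Hedged usage sketch for cut_y() defined above. The criterion is a plain
# python expression over 'Y' (the point value), as documented; the TS reference
# is a placeholder.
def _example_cut_y():
    """Illustrative only: split each TS into a positive part and its complement."""
    ts_list = [{"tsuid": "0000110000030003F3", "funcId": "WS1_flight1"}]  # placeholder
    matching, not_matching = cut_y(original_ts_list=ts_list,
                                   criterion="Y > 0",
                                   fid_pattern="{fid}_cutY{compl}",
                                   chunk_size=75000)
    # 'matching' holds the TS built from points where Y > 0, 'not_matching' the complement
    return matching, not_matching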
def _apply_motif_global_coll_ex1(self, activate_spark): """ Test - with the global method to search the neighborhood motif, - with/without spark according to activate_spark - exploring similarities with collisions heuristic - with input: the words have only one different letter. And every sequence Si has collisions with Sj with that matrix. Note: results ought to be equal to test_global_brute_no_spark_ex1 """ # Build the SAX result where the words have only one different letter (words: 5 letters) sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"] tested_sax_word = ''.join(sequences) spark_context = ScManager.get() sax_result = SaxResult(paa=spark_context.parallelize([]), breakpoints=[-1.1, -1, 0, 1.501], sax_word=tested_sax_word) sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context) # sax is an rdd -> to np.array sax = np.transpose(sax.collect()) breakpoint = sax_result.build_mindist_lookup_table(5) # Build a collision matrix (the real collision matrix is different, but we take this one for the test) collision_matrix = SparseMatrix( np.array([[ 0, 0, 0, 0, 0, ], [ 30, 0, 0, 0, 0, ], [ 2, 40, 0, 0, 0, ], [ 4, 8, 50, 0, 0, ], [ 6, 10, 20, 60, 0, ]])) self._print_matrix("test_global_coll_no_spark_ex1", collision_matrix.data, nb_seq) # mindist distances: # [[ 0. 0. 3.002 5.002 5.202] # [ 0. 0. 0. 2. 2.2 ] # [ 3.002 0. 0. 0. 0.2 ] # [ 5.002 2. 0. 0. 0. ] # [ 5.202 2.2 0.2 0. 0. ]] # Using neighborhood_method=OPT_USING_COLLISIONS # # for collisions (0,1) (1,2) (2,3) (3,4) greater than min_value==25 # and with the collisions heuristic: only sequences having collisions with Si or Sj are examined # # for radius 1.9 => global result is [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]] # # for radius 2.5 => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]] # => reduced to [[[0, 1, 2, 3, 4], [1, 2, 3, 4]] # # for radius 3.5 => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4]] # => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]] # # for radius 6 => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]] # => reduced to [[0, 1, 2, 3, 4]] # for radius, expected_res in [[2.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]], [ 1.9, [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]] ], [3.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]], [6, [[0, 1, 2, 3, 4]]]]: # Build the class for motif search where the min_value is 25 search_info = NeighborhoodSearch(size_sequence=20, mindist_lookup_table=breakpoint, alphabet_size=5, sax=np.transpose(sax), radius=radius, collision_matrix=collision_matrix) # for info: here is the mindist: # (see _print_mindist_mat doc: in order to activate print) self._print_mindist_mat(search_info) recognition_info = ConfigRecognition( is_stopped_by_eq9=True, iterations=0, min_value=25, is_algo_method_global=True, activate_spark=activate_spark, radius=radius, neighborhood_method=OPT_USING_COLLISIONS) print("radius {}:expected: {}".format( radius, expected_res)) result = search_info.motif_neighborhood_global( recognition_info.min_value, recognition_info) print("radius {}:->global with collisions: {}".format( radius, result)) self.assertEqual(len(result), len(expected_res)) for group in result: self.assertTrue(group in expected_res)
def run(self, tsuids): """ Run the Spark Distance calculation create the RDD for each tsuid, load the TS from tdm in a broadcast dictionary (ie shared by all workers) map the RDD with cartesian product ( ie get RDD1,RDD1 RDD1,RDD2 RDD2,RDD1 RDD2,RDD2 with 2 RDD) to get the comparison couples. then reduce applying distance function and add the result into Accumulator (ie shared by all workers) distance function take the two TS from broadcast dictionary, shrink the biggest and apply euclidean Usage: pi [tsuid1] [tsuid2] ...[tsuidn] example : tsuids = ['0000110000030003F30000040003F1', '0000110000030003F40000040003F1', '0000110000030003F50000040003F1', '0000110000030003F60000040003F1', '0000110000030003F70000040003F1'] :param tsuids: a list of tsuids (str) :type tsuids: list """ # creation of the RDD rdd = self.spark_context.parallelize(tsuids) self.logger.info("rdd parallelized") self.logger.info("loading TS") start_time = time.time() j = len(tsuids) // self.ts_load_split_size self.logger.debug(type(tsuids)) self.logger.info("Number of TS: %i ", len(tsuids)) ts = list() for i in range(0, j + 1): k = (i + 1) * self.ts_load_split_size if k > len(tsuids): k = len(tsuids) self.logger.info("extract TS from index %i to %i ", i * self.ts_load_split_size, k) ts.extend(self.tdm.get_ts(tsuids[i * self.ts_load_split_size:k])) ts_dic = dict() self.logger.info("Number of TS loaded : %i ", len(ts)) for index in range(0, len(tsuids)): ts_dic[tsuids[index]] = ts[index] # broadcast var used to get the map result broadcast_var = self.spark_context.broadcast(ts_dic) loading_end_time = time.time() self.logger.info("Loading Time : %s ", loading_end_time - start_time) # create the result accumulator list_accum = self.spark_context.accumulator(dict(), ListAccumulatorParam()) def calculate_distance(tsuid_list): """ :param tsuid_list: a pair of tsuids :type tsuid_list: list """ # use py4j logger to avoid Serialization problems. logger = logging.getLogger('py4j') logger.setLevel(logging.INFO) logger.removeHandler(logger.handlers[0]) # sh = logging.StreamHandler(sys.stdout) stream_handler = logging.StreamHandler() stream_handler.setLevel(logging.INFO) formatter = logging.Formatter( '%(asctime)s:%(levelname)s:%(funcName)s:%(message)s') stream_handler.setFormatter(formatter) logger.addHandler(stream_handler) # start distance calculus. logger.debug("tsuid1= %s", tsuid_list[0]) logger.debug("tsuid2= %s", tsuid_list[1]) if tsuid_list[0] != tsuid_list[1]: first_ts = np.array(broadcast_var.value[tsuid_list[0]][:, 1]) second_ts = np.array(broadcast_var.value[tsuid_list[1]][:, 1]) calculus_len = min(len(first_ts), len(second_ts)) distance = euclidean(first_ts[0:calculus_len], second_ts[0:calculus_len]) # logger.debug("tsuid list %s and distance %f" % (tsuid_list, distance)) list_accum.add({tsuid_list: distance}) __import__('ikats.algo.core.distance') rdd.cartesian(rdd).foreach(calculate_distance) ScManager.stop() computation_end_time = time.time() self.logger.info("Loading Time : %s ", loading_end_time - start_time) self.logger.info("Compute Time : %s ", computation_end_time - loading_end_time) return list_accum.value
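# Minimal pyspark sketch of the pattern used by run() above: a cartesian product
# to build the comparison pairs, a broadcast dict to share the TS values, and a
# dict accumulator to collect one distance per pair. The accumulator class and
# the distance function below are simplified stand-ins, not the IKATS ones.
def _example_cartesian_distance_sketch(spark_context):
    """Illustrative only: pairwise euclidean distances via cartesian + accumulator."""
    from pyspark.accumulators import AccumulatorParam

    class _DictAccumulator(AccumulatorParam):
        """Merge worker-side dicts into a single driver-side dict."""
        def zero(self, value):
            return dict(value)

        def addInPlace(self, value1, value2):
            value1.update(value2)
            return value1

    data = {"ts_a": [1.0, 2.0, 3.0], "ts_b": [1.0, 2.5, 2.0]}
    broadcast_var = spark_context.broadcast(data)
    accum = spark_context.accumulator(dict(), _DictAccumulator())

    def _distance(pair):
        key_1, key_2 = pair
        if key_1 != key_2:
            values_1 = broadcast_var.value[key_1]
            values_2 = broadcast_var.value[key_2]
            # euclidean distance on the shortest common length
            size = min(len(values_1), len(values_2))
            dist = sum((a - b) ** 2 for a, b in zip(values_1[:size], values_2[:size])) ** 0.5
            accum.add({pair: dist})

    rdd = spark_context.parallelize(list(data.keys()))
    rdd.cartesian(rdd).foreach(_distance)
    return accum.value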
def _apply_iter_coll_no_spark_ex1(self, activate_spark): """ Tests motif_neighborhood_iterative() - the iterative method - using the heuristic based upon collisions - to search the neighborhood motif Note: test where the words have only one different letter. """ # Build the SAX result where the words have only one different letter (words: 5 letters) sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"] tested_sax_word = ''.join(sequences) spark_context = ScManager.get() sax_result = SaxResult(paa=spark_context.parallelize([]), breakpoints=[-1.1, -1, 0, 1.501], sax_word=tested_sax_word) sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context) # sax is an rdd -> to np.array sax = np.transpose(sax.collect()) breakpoint = sax_result.build_mindist_lookup_table(5) # Build a collision matrix # Note: this matrix is different from the one from # test test_iterative__brute_no_spark_ex1: # => see zeros are added: coll(3,2) == coll(4,2) == 0 collision_matrix = SparseMatrix( np.array([[ 0, 0, 0, 0, 0, ], [ 40, 0, 0, 0, 0, ], [ 2, 40, 0, 0, 0, ], [ 4, 8, 0, 0, 0, ], [ 6, 10, 0, 50, 0, ]])) self._print_matrix("test_iterative__brute_no_spark_ex1", collision_matrix.data, nb_seq) # mindist distances: # [[ 0. 0. 3.002 5.002 5.202] # [ 0. 0. 0. 2. 2.2 ] # [ 3.002 0. 0. 0. 0.2 ] # [ 5.002 2. 0. 0. 0. ] # [ 5.202 2.2 0.2 0. 0. ]] # Using neighborhood_method=OPT_USING_BRUTE_FORCE # # iterative: examining collisions (i,j) per iteration: # (3,4) then (1,2) +(0,1) # # (collisions greater than min_value==25) # # Test with fixed radius 1.9: # - iter=1 => result is [[3, 4]] considering (S3,S4) neighborhood # - iter=2 => result extended with [0,1,2] considering (S0,S1), unchanged for (S1,S2) # - iter=3 => result is the same than for iter=2: no more collision available # - iter=100 => result is the same than for iter=2: no more collision available # for radius, nb_iter, expected_res in [[1.9, 1, [[3, 4]]], [1.9, 2, [[3, 4], [0, 1, 2]]], [1.9, 3, [[3, 4], [0, 1, 2]]], [1.9, 100, [[3, 4], [0, 1, 2]]]]: # Build the class for motif search where the min_value is 25 search_info = NeighborhoodSearch(size_sequence=20, mindist_lookup_table=breakpoint, alphabet_size=5, sax=np.transpose(sax), radius=radius, collision_matrix=collision_matrix) # for info: here is the mindist: # (see _print_mindist_mat doc: in order to activate print) self._print_mindist_mat(search_info) recognition_info = ConfigRecognition( is_stopped_by_eq9=True, iterations=nb_iter, min_value=25, is_algo_method_global=False, activate_spark=activate_spark, radius=radius, neighborhood_method=OPT_USING_COLLISIONS) result = search_info.motif_neighborhood_iterative( recognition_info.min_value, recognition_info) self.assertEqual(len(result), len(expected_res)) for group in result: self.assertTrue(group in expected_res)
def main_test(): """ Functional test entry point """ logger = logging.getLogger("ikats.algo.core.correlation") # Log format logger.setLevel(logging.DEBUG) formatter = logging.Formatter( '%(asctime)s:%(levelname)s:%(funcName)s:%(message)s') # Create another handler that will redirect log entries to STDOUT stream_handler = logging.StreamHandler() stream_handler.setLevel(logging.DEBUG) stream_handler.setFormatter(formatter) logger.addHandler(stream_handler) # Use os.environ (not os.putenv) so the values are visible to os.getenv in this process if os.getenv("PYSPARK_PYTHON") is None: os.environ["PYSPARK_PYTHON"] = "/home/ikats/tools/ikats_processing/bin/python" if os.getenv("SPARK_HOME") is None: os.environ["SPARK_HOME"] = "/opt/spark" print('Loading Spark Context') # Get a spark Context ScManager.get() tdm = TemporalDataMgr() answer = 'n' tsuid_list = [] ds_name = '' while answer.lower() != 'y': ds_name = input('\nEnter dataset Name: ') tsuid_list = tdm.get_data_set(ds_name)['ts_list'] print("%s TS found in dataset %s" % (len(tsuid_list), ds_name)) if len(tsuid_list) > 0: answer = input( "Run the correlation matrix on this dataset? [Y/n] ") print('Running correlation matrix on %s TS' % len(tsuid_list)) start_time = time.time() sp_corr = SparkCorrelation(tdm) sp_corr.force_parallel_get_ts = True sp_corr.run(tsuid_list) print( "EXECUTION TIME (for %d TS with %d pts/ea = %d points): %.3f seconds" % (len(tsuid_list), sp_corr.ts_len_ref, (len(tsuid_list) * sp_corr.ts_len_ref), (time.time() - start_time))) if os.path.isfile('/tmp/spark_correlation_result_%s.csv' % ds_name): os.remove('/tmp/spark_correlation_result_%s.csv' % ds_name) with open('/tmp/spark_correlation_result_%s.csv' % ds_name, 'w', newline='') as opened_file: opened_file.write(sp_corr.get_csv()) print("Matrix in CSV format is saved at the following location:") print(" /tmp/spark_correlation_result_%s.csv" % ds_name) print("You can check the content by doing :") print(" cat /tmp/spark_correlation_result_%s.csv" % ds_name) print(" less /tmp/spark_correlation_result_%s.csv" % ds_name) print(" vi /tmp/spark_correlation_result_%s.csv" % ds_name)
def random_projections(ts_list, sax_info, collision_info, recognition_info): """ The Random Projections Algorithm ================================ This algorithm does the following (detailed for 1 TS but valid for many TS): * Apply the sliding window * Normalize the TS (global or/and local) * Filter the linear sequences (optional) and trivial matches * Apply the SAX algorithm * Build the collision matrix * Find the largest value cells in the collision matrix * Search the motif neighborhood ..note:: The algorithm can produce "paa values" (numeric) for each sequence. The problem is the huge length of the results. **Catalogue implementation is provided**: main_random_projections() is calling random_projections() once all configurations ConfigSAX, ConfigCollision, ConfigRecognition are initialized. :param ts_list: list of TSUID :type ts_list: list :param sax_info: the information to make the sliding window and the sax_algorithm :type sax_info: ConfigSax :param collision_info: the information to build the collision matrix :type collision_info: ConfigCollision :param recognition_info: the information to made the pattern _recognition :type recognition_info: ConfigRecognition :return: the list of similar sequences, the sax result, the equation 9 result, and the sequences list :type: list, str, float, list """ LOGGER.info("Configurations deduced from user parameters:") LOGGER.info("- sliding sax nb paa=%s", sax_info.paa) LOGGER.info("- sliding sax alphabet size=%s", sax_info.alphabet_size) LOGGER.info("- sliding sax sequences_size=%s", sax_info.sequences_size) LOGGER.info("- collision nb indexes=%s", collision_info.index) LOGGER.info("- collision nb iterations=%s", collision_info.nb_iterations) LOGGER.info("- collision accepted errors=%s", collision_info.errors) LOGGER.info("- recognition min_value=%s", recognition_info.min_value) LOGGER.info("- recognition iterations=%s", recognition_info.iterations) LOGGER.info("- recognition similarity radius=%s", recognition_info.radius) # Create or get a spark Context LOGGER.info("Running using Spark") spark_ctx = ScManager.get() # INPUT : all the TS { "ts_name" : [[time1, value1],...], "ts_name2": ... } # OUTPUT : rdd_sequences_list = [ (key, sequence), ... ] # rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...] # PROCESS : *sliding_windows* create sequences for each TS (results are RDDs) rdd_sequences_list, rdd_normalization_coefficients = sliding_windows(ts_list=ts_list, sax_info=sax_info, spark_ctx=spark_ctx, trivial_radius=recognition_info.radius / 2) # INPUT : rdd_sequences_list = [ (key, sequence), ... ] # OUTPUT : rdd_sax_result is a SaxResult object containing # * paa (rdd of flatMap) : rdd of large list of all the paa_values concatenated # * breakpoints (list) : list of the breakpoints (len = sax_info.alphabet_size - 1) # * sax_word (large str): large string of all the SAX words concatenated # PROCESS : Give the SAX form of the sequences rdd_sax_result = run_sax_on_sequences(rdd_sequences_data=rdd_sequences_list, paa=sax_info.paa, alphabet_size=sax_info.alphabet_size) # INPUT : rdd_sequences_list = [ (key, sequence), ... ] # OUTPUT : sequences_list = { key: sequence, ...} NOT AN RDD! # PROCESS : transform rdd_sequences_list elements into dict sequences_list = rdd_sequences_list.collectAsMap() # INPUT : rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...] # OUTPUT : sequences_list = { key: (un-normalized seq_mean, un-normalized seq_sd), ...} NOT AN RDD! 
# PROCESS : transform rdd_normalization_coefficients elements into dict normalization_coefficients = rdd_normalization_coefficients.collectAsMap() # Keep only necessary information of each sequence sequences_list = sequences_info(sequences_list, normalization_coefficients) # *paa_sequence* is a "conversion" of *sax* from letters to numbers (matrix with same shape) # (usefull for past-processing the random projection algorithm). breakpoints = [str(i) for i in rdd_sax_result.breakpoints] # Build the table which give the distance between two letters (need just sax_result.breakpoints) mindist_lookup_table = rdd_sax_result.build_mindist_lookup_table(sax_info.alphabet_size) # Give the SAX result in a array (need rdd_sax_result.sax_word and sax_result.paa) rdd_sax, paa_result, number_of_sequences = rdd_sax_result.start_sax(sax_info.paa, spark_ctx=spark_ctx) LOGGER.info("- filtered number of words=%s", number_of_sequences) if number_of_sequences == 1: LOGGER.info("- sliding window find just one sequence, no collision matrix computed.") collision_matrix = SparseMatrix(np.array([[0]])) else: # Build the collision matrix, the number of iteration can change # (if the len of a sequence is too small for example nb_iteration can be < nb_iteration specified) collision_matrix, collision_info.nb_iterations = final_collision_matrix( sax=rdd_sax, number_of_iterations=collision_info.nb_iterations, index_selected=collision_info.index, word_len=sax_info.paa, spark_ctx=spark_ctx) # *collision_matrix* is a sparse matrix : light in memory # Give the result of the Equation 9 eq9_result = equation9(number_of_sequences=number_of_sequences, size_alphabet=sax_info.alphabet_size, size_word=sax_info.paa, errors=collision_info.errors, index_selected=collision_info.index, iterations=collision_info.nb_iterations) sax = rdd_sax.collect() paa_result = np.transpose(paa_result) distance_info = NeighborhoodSearch(size_sequence=sax_info.sequences_size, mindist_lookup_table=mindist_lookup_table, alphabet_size=sax_info.alphabet_size, sax=sax, radius=recognition_info.radius, collision_matrix=collision_matrix) LOGGER.info("- theoretical Eq9 limit: min collisions = %s for accepted errors=%s", eq9_result, collision_info.errors) # Check the eq9_result with min_value if eq9_result < recognition_info.min_value: LOGGER.warning("- setting Eq9 limit to min_value=%s: because Eq9 < min_value", recognition_info.min_value) eq9_result = recognition_info.min_value if eq9_result < 1: LOGGER.warning("- setting Eq9 limit to 1: because Eq9 < 1") eq9_result = 1 # find the motif neighborhood by using the largest value cells in the collision matrix if recognition_info.is_algo_method_global is True: algo_result = distance_info.motif_neighborhood_global(eq9_result, recognition_info) else: algo_result = distance_info.motif_neighborhood_iterative(eq9_result, recognition_info) # Give the results with the names of sequences and not their number in the collision matrix algo_result = result_on_sequences_form(algo_result, sequences_list, sax, sax_info.alphabet_size, paa_result) algo_result = result_on_pattern_form(algo_result) # Give the alphabet used in the SAX algorithm alphabet = start_alphabet(sax_info.alphabet_size) result = {'patterns': algo_result, 'break_points': breakpoints, 'disc_break_points': alphabet} if spark_ctx is not None: ScManager.stop() LOGGER.info("Ended Spark session.") return result
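# Hedged usage sketch for random_projections(). The ConfigSax keyword arguments
# below match those used in this project's unit tests; the ConfigCollision and
# ConfigRecognition keyword names are assumptions inferred from the attributes
# read by random_projections() (index, nb_iterations, errors / min_value,
# iterations, radius, is_algo_method_global) and may not match the real
# constructors. All numeric values are illustrative placeholders.
def run_random_projections_example(ts_list):
    """
    Usage sketch: run the random projections motif search on a list of TSUIDs.
    """
    sax_info = ConfigSax(paa=10, sequences_size=100, with_mean=True, with_std=True,
                         global_norm=False, local_norm=True, linear_filter=True,
                         recovery=0.5, coefficients=[0.1, 0.9], alphabet_size=5)
    # Assumed keyword names (see lead-in comment)
    collision_info = ConfigCollision(index=2, nb_iterations=10, errors=2)
    recognition_info = ConfigRecognition(min_value=5, iterations=10, radius=1.5,
                                         is_algo_method_global=True)

    # Returns {'patterns': ..., 'break_points': ..., 'disc_break_points': ...}
    return random_projections(ts_list=ts_list,
                              sax_info=sax_info,
                              collision_info=collision_info,
                              recognition_info=recognition_info)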
def calc_quality_stats(ts_list, compute_value=True, compute_time=True, chunk_size=75000, force_save=True): """ Compute the quality statistics Returns a dict as follow { "TSUIDx" : { "MetadataX": ValueX, ... }, ... } Don't override default chunk_size unless you know what you are doing. It defines the number of points in a single chunk (assuming th TS is periodic) Use it only for performances purposes :param ts_list: List of TSUID to work onto :type ts_list: list :param compute_value: boolean indicating to compute metadata related to value :type compute_value: bool :param compute_time: boolean indicating to compute metadata related to time :type compute_time: bool :param chunk_size: (Advanced usage) Override the chunk size :type chunk_size: int :param force_save: Save metadata even if already present (default True) :type force_save: bool :return: Tuple composed of the input ts list and a dict having TSUID as key and a value being sub-dict where key is metadata name :rtype: tuple dict """ if not compute_value and not compute_time: LOGGER.error("You shall compute at least one set of metadata.") raise ValueError("You shall compute at least one set of metadata") try: # Convert tsuid_list [{tsuid:x, fid:x},...] to tsuid_list [tsuid,...] tsuid_list = [x['tsuid'] for x in ts_list] except TypeError: # Already a tsuid_list. No change tsuid_list = ts_list LOGGER.info('Computing Quality stats for %s TS', len(tsuid_list)) # Get all metadata md_list = IkatsApi.md.read(ts_list=tsuid_list) # Initialize results results = {} for tsuid in tsuid_list: results[tsuid] = {} try: # Get Spark Context # Important !!!! Use only this method in Ikats to use a spark context spark_context = ScManager.get() results = {} for index, tsuid in enumerate(tsuid_list): LOGGER.info('Processing Quality stats for TS %s (%s/%s)', tsuid, index, len(tsuid_list)) # Generating information about TSUID chunks # ([chunk_index, sd, ed], ...) ts_info = [] for chunk_index in range( _ts_chunk_count(tsuid=tsuid, md_list=md_list, chunk_size=chunk_size)): ts_info.append( _ts_chunk(tsuid=tsuid, index=chunk_index, md_list=md_list, chunk_size=chunk_size)) # Parallelizing information to work with spark # Each chunk can be computed separately, so divided into len(chunks) partitions rdd_ts_info = spark_context.parallelize(ts_info, max(8, len(ts_info))) # RDD containing the list of points values for every chunk of a TSUID # (without timestamps): # ([chunk_index, [[timestamp, value], ...], ...) rdd_ts_dps = rdd_ts_info \ .map(lambda x: (x[0], _ts_read(tsuid=tsuid, start_date=x[1], end_date=x[2]))) # This RDD is used multiple times, caching it to speed up rdd_ts_dps.cache() if compute_value: # Compute metadata related to "value" information result = calc_qual_stats_value(tsuid, rdd_ts_dps, force_save=force_save) # Append to final results if tsuid in results: results[tsuid].update(result[tsuid]) else: results.update(result) if compute_time: # Compute metadata related to "time" information result = calc_qual_stats_time(tsuid, rdd_ts_dps, force_save=force_save) # Append to final results if tsuid in results: results[tsuid].update(result[tsuid]) else: results.update(result) # We don't need the cache anymore rdd_ts_dps.unpersist() except Exception as cause: raise IkatsException("Quality stats failure with ...", cause) finally: ScManager.stop() return ts_list, results
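# Hedged usage sketch for calc_quality_stats(): the TSUIDs below are placeholders.
# As handled at the top of calc_quality_stats(), the function accepts either plain
# TSUIDs or {'tsuid': ..., 'fid': ...} dicts.
def run_quality_stats_example():
    """
    Usage sketch: compute both "value" and "time" quality metadata, default chunk size.
    """
    ts_list = ["tsuid_1", "tsuid_2"]  # placeholder TSUIDs

    # Returns (input ts_list, {tsuid: {metadata_name: value, ...}, ...})
    _, stats = calc_quality_stats(ts_list, compute_value=True, compute_time=True)
    return stats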
def unwrap_ts_list(ts_list, unit=TSUnit.Radians, discontinuity=None, fid_pattern="%(fid)s__unwrap", use_spark=True): """ Unwrap a list of TS by changing deltas between values to 2*discontinuity complement. Unwrap phase of each TS composing the dataset :param ts_list: list of TSUID to unwrap :param unit: TS unit : "Degrees" or "Radians" (default) :param discontinuity: Maximum discontinuity between values. :param fid_pattern: Pattern of the new FID ('%(fid)s' will be replaced by original FID) :param use_spark: Set to True to use spark. True is default :type ts_list: list :type unit: str or TSUnit :type discontinuity: float or None :type fid_pattern: str :type use_spark: bool :return: a new ts_list :rtype: list :raises TypeError: if input is not well formatted """ if not isinstance(ts_list, list) or len(ts_list) == 0: raise TypeError("ts_list shall be a list having at least one TS") if discontinuity is None: raise ValueError("Discontinuity is not filled") results = [] if use_spark: # Get Spark Context spark_context = ScManager.get() try: # Parallelize 1 TS = 1 partition rdd_ts_list = spark_context.parallelize(ts_list, len(ts_list)) rdd_results = rdd_ts_list.map( lambda x: unwrap_tsuid(tsuid=x["tsuid"], fid=x["funcId"], fid_pattern=fid_pattern, discontinuity=discontinuity, unit=unit)) # Persist data to not recompute them again # (Functional identifier reservation called multiple times through IkatsApi.ts.create_ref) rdd_results.cache() timings = rdd_results.map(lambda x: x[1]).reduce( lambda x, y: x + y) results = rdd_results.map(lambda x: x[0]).collect() rdd_results.unpersist() LOGGER.debug("Unwrapping %s TS using Spark: %s", len(ts_list), timings.stats()) finally: # Stop the context ScManager.stop() else: timings = Timings() for item in ts_list: tsuid = item["tsuid"] fid = item["funcId"] result, tsuid_timings = unwrap_tsuid(tsuid=tsuid, fid=fid, fid_pattern=fid_pattern, discontinuity=discontinuity, unit=unit) results.append(result) timings += tsuid_timings LOGGER.debug("Unwrapping %s TS: %s", len(ts_list), timings.stats()) return results
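# Hedged usage sketch for unwrap_ts_list(): the ts_list item carries the 'tsuid'
# and 'funcId' keys read by the function above; the values are placeholders, and
# the discontinuity of pi is only an illustrative choice.
def run_unwrap_example():
    """
    Usage sketch: unwrap a phase timeseries expressed in radians.
    """
    from math import pi

    ts_list = [{"tsuid": "tsuid_1", "funcId": "FID_1"}]  # placeholder TS reference

    # Returns the new ts_list of unwrapped TS
    return unwrap_ts_list(ts_list=ts_list,
                          unit=TSUnit.Radians,
                          discontinuity=pi,
                          fid_pattern="%(fid)s__unwrap",
                          use_spark=True)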
def spark_ccf(tdm, tsuid_list_or_dataset, lag_max=None, tsuids_out=False, cut_ts=False): """ This function calculates the maximum of the cross correlation function matrix between all ts in **tsuid_list_or_dataset** IN A DISTRIBUTED MODE (using spark) Cross correlation is a correlation between two timeseries whose one is delayed of successive lag values. Result of CCF is a timeseries (correlation function of the lag between timeseries). This function keep the maximum value of the CCF function generated and pull it in the matrix for corresponding timeseries couple. :returns: a string matrix (whose size is equal to the number of tsuids in tsuid_list_or_dataset plus one line and one column for headers) :rtype: ndarray :param tdm: Temporal Data Manager client :param tsuid_list_or_dataset: list of identifiers of the time series or dataset name :param lag_max: maximum lag between timeseries (cf. _ccf function for more details) :param tsuids_out: True to fill headers with tsuids False to fill headers with functional ids :param cut_ts: Cut the TS list to the min-length if set to True :type tdm: TemporalDataMgr :type tsuid_list_or_dataset: list of str or str :type lag_max: positive int :type tsuids_out: boolean :type cut_ts: bool :raises TypeError: if tdm is not a TemporalDataMgr :raises TypeError: if tsuid_list_or_dataset is not a list nor a string :raises TypeError: if tsuids_out is not a boolean """ if type(tdm) is not TemporalDataMgr: raise TypeError("tdm must be a TemporalDataMgr") if type(tsuid_list_or_dataset) is not list and type( tsuid_list_or_dataset) is not str: raise TypeError( "tsuid_list_or_dataset must be a list of string OR a string") if type(tsuids_out) is not bool: raise TypeError("tsuids_out must be a boolean") if type(cut_ts) is not bool: raise TypeError("cut_ts must be a boolean") if type(tsuid_list_or_dataset) is list: # input is a list of tsuid tsuid_list = tsuid_list_or_dataset else: # input is a dataset name dataset = tdm.get_data_set(tsuid_list_or_dataset) tsuid_list = dataset['ts_list'] if tsuids_out: ts_list = tsuid_list else: ts_list = __retrieve_func_id(tdm, tsuid_list) md_list = tdm.get_meta_data(tsuid_list) # initialize size of time series min_ts_size = md_list[tsuid_list[0]]['qual_nb_points'] if cut_ts: for ts in tsuid_list: min_ts_size = min(min_ts_size, md_list[ts]['qual_nb_points']) else: # check time series have same length for ts in tsuid_list: size_ts = md_list[ts]['qual_nb_points'] if size_ts != min_ts_size: raise ValueError('time series do not have same length') # Create or get a spark Context sc = ScManager.get() # Build the RDD with TSUIDS rdd = sc.parallelize(tsuid_list) # Create a broadcast for spark jobs broadcast = sc.broadcast({ "host": tdm.host, "port": tdm.port, "size_of_ts": min_ts_size, "lag_max": lag_max }) # Create an accumulator to store the results of the spark workers accumulator = sc.accumulator(dict(), ListAccumulatorParam()) def run_ccf_spark(working_tsuids): """ Method called by spark job :param working_tsuids: rdd item :type working_tsuids: tuple """ # cross correlation is equal to 1 if timeseries are the same if working_tsuids[0] == working_tsuids[1]: result = 1 else: spark_tdm = TemporalDataMgr(host=broadcast.value['host'], port=broadcast.value['port']) result = __run_max_ccf_ts_list(tdm=spark_tdm, tsuids=list(working_tsuids), size=int( broadcast.value['size_of_ts']), lag_max=broadcast.value['lag_max']) accumulator.add({";".join(list(working_tsuids)): result}) # Get TS content and perform ccf calculation using spark distribution to 
# increase performance
    # for each element of rdd, which is a couple of timeseries:
    # the list of couples is first sorted, then duplicates are suppressed to avoid doing
    # the same calculation twice, as for (a,b) and (b,a)
    rdd.cartesian(rdd).map(
        lambda x: tuple(sorted(list(x)))).distinct().foreach(run_ccf_spark)

    # Retrieve results from the accumulator to fill the correlation matrix
    ts_nb = len(tsuid_list)
    matrix_corr = np.zeros((ts_nb, ts_nb))
    for str_couple in accumulator.value:
        couple = str_couple.split(';')
        matrix_corr[
            tsuid_list.index(couple[0]),
            tsuid_list.index(couple[1])] = accumulator.value[str_couple]
        matrix_corr[
            tsuid_list.index(couple[1]),
            tsuid_list.index(couple[0])] = accumulator.value[str_couple]

    # fill final matrix with headers
    matrix = __fill_headers_to_final_matrix(matrix_corr, ts_list)

    return matrix
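# Hedged usage sketch for spark_ccf(): TemporalDataMgr() with its default connection
# settings mirrors its use in main_test() above; the dataset name "my_dataset" is a
# placeholder. Passing a dataset name (instead of a tsuid list) is supported by the
# tsuid_list_or_dataset argument.
def run_ccf_example():
    """
    Usage sketch: maximum cross-correlation matrix of a whole dataset.
    """
    tdm = TemporalDataMgr()

    # cut_ts=True cuts all TS to the shortest one so that every pair has the same length
    matrix = spark_ccf(tdm, "my_dataset", lag_max=None, tsuids_out=False, cut_ts=True)

    # matrix is a string ndarray with one header row and one header column (functional ids here)
    return matrix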
def correlation_ts_list_loop(ts_list, corr_method, context_meta, variable_meta='metric',
                             config=ConfigCorrelationLoop(the_num_partitions=24,
                                                          the_point_cache_size=50e6,
                                                          the_digits_number=4)):
    """
    Computes the correlations between timeseries selected by observed variables and contexts.

    The observed contexts are defined by the context_meta argument.
    The variables are defined by the variable_meta argument.

    Assumed:
      - Each context has a list of distinct variables.
      - Each timeseries is uniquely associated to one context and one variable.

    Example with Airbus data:
      - the *context* is a flight in an Airbus dataset of timeseries.
      - the *variables* could be metric 'WS1', metric 'WS2' etc.

    This algorithm is spark-distributed on the cluster.

    Spark summary
    *************

      - **step 1** The driver prepares a set of configured tuples: each tuple is configured for one
        context, and has a list of (variable, timeseries reference). Timeseries references are tsuids.

      - **step 2** An RDD is initialized from the set of cells **'configured tuples'**.

      - **step 3** A new RDD is computed from step 2: each cell **'configured tuple'** is transformed
        into a list of **'correlation inputs'**: this cell is prepared to be processed by the
        correlation method, for a subpart of the correlation matrix computed for one context.
        At this step, each task executes: *_spark_combine_pairs()*

      - **step 4** A new RDD is computed as a set of **'correlation result'** cells from the cells
        **'correlation inputs'**: each task reads timeseries pairs and computes the correlation
        result from the selected method (Pearson, ...).
        At this step, each task executes: *_spark_correlate_pairs()*

      - **step 5**: aggregates the **'correlation result'** cells by variable pairs into an RDD of
        **'aggregated correlations'** cells. Each task will

        1. create and save low-level results CorrelationsByContext into the IKATS database,
           as JSON content.

           .. seealso:: the JSON is described in
              ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict()

        2. return **'aggregated correlation'** cells providing

           - the pair of variable indexes
           - the aggregated values: Mean, Variance
           - the saved reference of the CorrelationsByContext

        At this step, each task executes: *_spark_build_corrs_by_context()*

      - **step 6**: the driver collects the RDD of **'aggregated correlations'**, and computes the
        high-level result, which is a CorrelationDataset. Finally the JSON generated by
        CorrelationDataset is returned.

    :param ts_list: selected timeseries list on which are computed the correlations
    :type ts_list: list

    :param corr_method: the method computing the correlation between 2 timeseries.
        The value must be in CORRELATION_METHODS. Choose PEARSON to apply the Pearson correlation.
    :type corr_method: str

    :param context_meta: name of the metadata identifying each observed context,
        where correlations are computed.

        .. note:: this metadata shall exist for each timeseries, otherwise the latter will be ignored.

        With the Airbus example: 'FlightIdentifier' identifies the flight as observed context.
    :type context_meta: str

    :param variable_meta: Optional, with default value 'metric': the name of the metadata
        identifying the variables.

        .. note:: this metadata shall exist for each timeseries, otherwise the latter will be ignored.

        The metadata values will be sorted in a list providing the effective indexes of the
        correlation matrices: the N-th index is reserved to the timeseries having the N-th value
        of this metadata in alphanumeric order.
It is advised to keep the default value: this advanced argument must provide distinct indexes for each timeseries under same observed context. :type variable_meta: str :return: JSON-friendly dict grouping - Matrix of means of correlations (see step5) - Matrix of variances of correlations (see step5) - Matrix of references to the JSON content of CorrelationByContext (see step 5) .. seealso:: detailed JSON structure in ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict() :rtype: dict as json-friendly structure for json library :raise exception: IkatsException when an error occurred while processing the correlations. """ sc = None try: LOGGER.info("Starting correlation loop ...") LOGGER.info(" - observed contexts based on: %s", context_meta) LOGGER.info(" - variables ordered by: %s", variable_meta) # Check parameters corr_func = CORRELATION_FUNCTIONS.get(corr_method, None) if corr_func is None: msg = "Unknown correlation method from CORRELATION_FUNCTIONS: corr_method={}" raise IkatsException(msg.format(corr_method)) if type(ts_list) is not list: msg = "Unexpected type: list expected for ts_list={}" raise IkatsException(msg.format(msg.format(ts_list))) if type(context_meta) is not str or len(context_meta) == 0: msg = "Unexpected arg value: defined str is expected for context_meta={}" raise IkatsException(msg.format(msg.format(context_meta))) if type(variable_meta) is not str or len(variable_meta) == 0: msg = "Unexpected arg value: defined str is expected for variable_meta={}" raise IkatsException(msg.format(msg.format(variable_meta))) # Hyp: the metadata part can be loaded from the driver ts_metadata_dict = IkatsApi.md.read(ts_list) # Note: the algorithm discards the variables X without Corr(X,Y) for Y different from X # but when X is retained, the final result will present the Corr(X,X) beside the Corr(X,Y) corr_loop_config, sorted_contexts, sorted_variables = _initialize_config_from_meta( ts_metadata_dict, context_meta=context_meta, variable_meta=variable_meta) LOGGER.info("- sorted_contexts=%s", sorted_contexts) LOGGER.info("- sorted_variables=%s", sorted_variables) nb_contexts = len(sorted_contexts) if nb_contexts * len(sorted_variables) == 0: # Algo simply return empty result when there is no variable or no context consistent # # - case 1: case when there is no computable Corr(X, Y) # where variables X and Y are different for the same context # - case 2: missing metadata for context_name => no context # - case 3: missing metadata for ordering_meta => no variable # LOGGER.warning("Empty result from selection=%s", ts_list) obj_empty_result = CorrelationDataset() obj_empty_result.set_contexts(contexts=sorted_contexts, meta_identifier=context_meta) obj_empty_result.set_variables(labels=sorted_variables) obj_empty_result.add_matrix(matrix=[], desc_label="Empty Mean correlation") obj_empty_result.add_matrix( matrix=[], desc_label="Empty Variance correlation") obj_empty_result.add_rid_matrix(matrix=[]) return obj_empty_result.get_json_friendly_dict() # Computes the number of matrix chunks # one matrix chunk will be handled by one task at # ------------------------------------- if nb_contexts < config.num_partitions: # Case when there are fewer contexts than recommended partitions: # - the computing of one matrix is split into several chunks nb_matrix_blocks = ceil(float(config.num_partitions) / nb_contexts) else: nb_matrix_blocks = 1 LOGGER.info("- number of matrix blocks by context=%s", nb_matrix_blocks) # Computes the timeseries LRU cache size used by one task # 
------------------------------------------------------- # 1/ retrieve nb points for each TS, default value is assumed to be 1e6 in order to be robust # in case 'qual_nb_points' is not available, (should not happen ...) defined_nb_points = [ int(v.get('qual_nb_points', 1e6)) for v in ts_metadata_dict.values() ] # 2/ evaluate the number of points by one task carrying one matrice chunk total_nb_points_by_ctx = sum( defined_nb_points) / nb_contexts / nb_matrix_blocks if config.the_point_cache_size >= total_nb_points_by_ctx: # the best condition: # system will memorize in the cache every loaded ts under the same matrice ts_cache_size = len(sorted_variables) else: # the case when it is required to limit the number TS memorized in the cache, # under the same row of correlation matrice # Note: len(sorted_variables) == max size of correlation row == dim matrice ts_cache_size = config.the_point_cache_size / total_nb_points_by_ctx * len( sorted_variables) ts_cache_size = ceil(max(2.0, ts_cache_size)) LOGGER.info("- ts_cache_size=%s", ts_cache_size) # release ts_metadata_dict from memory ts_metadata_dict = None sc = ScManager.get() # Spark_step_1: initialize the RDD # ------------ # OUTPUT: RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] ) rdd_initial_config = sc.parallelize(corr_loop_config, config.num_partitions) # Spark_step_2: combinate the pairs of timeseries by contexts and by chunks # ------------ # INPUT: RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] ) # OUTPUT: RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] ) # # where <pair X_Y> is ((<var X index>, <tsuid X> ), (<var Y index>, <tsuid Y>)) # # PROCESS: computes the cartesian product and split the list of pairs into smaller-sized lists # rdd_var_combinations = rdd_initial_config.flatMap( lambda x: _spark_combine_pairs(context=x[0], variables=x[1], nb_corr_matrix_blocks= nb_matrix_blocks)) if nb_matrix_blocks > 1: # reshuffles all the data over the cluster ... 
rdd_var_combinations = rdd_var_combinations.repartition( nb_contexts * nb_matrix_blocks) # Spark_step_3: computes the correlations # ------------ # INPUT: RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] ) # OUTPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> ) # # where # <computed corr X_Y> is (<context>, (<tsuid X>, <tsuid Y>), correlation) # # PROCESS: computes the correlations on the timeseries associated to the variables # rdd_correlations = rdd_var_combinations.flatMap( lambda x: _spark_correlate_pairs(context=x[0], var_pairs=x[1], corr_method=corr_method, ts_cache_size=ts_cache_size)) # generates the parent_id: # presently this identifier may be used by Postgres admin, # to group the low-level results attached to the same high-level result # => at the moment a label including a timestamp is generated obj_result = CorrelationDataset() parent_id = obj_result.get_id() def r_append(data, computed_corr): """ Append computed correlation to data :param data: :param computed_corr: :return: """ data.append(computed_corr) return data def r_merge(one, two): """ Merge two to one :param one: :param two: :return: """ one.extend(two) return one # Spark_step_4: aggregate the correlations by pair of variables # ------------ # INPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> ) as described previously # # OUTPUT: RDD of ( (<var X index>, <var Y index>), list of tuples: # (<context index>, (tsuid_X, tsuid_Y), <correlation result> ) # ) # PROCESS: aggregates by key=(<var X index>, <var Y index>) the correlation information profiles, # enhanced with tsuid pairs # rdd_agg_correlations = rdd_correlations.aggregateByKey( zeroValue=[], seqFunc=r_append, combFunc=r_merge) # Spark_step_5: # ------------ # INPUT: RDD of ( (<var X index>, <var Y index>), list of tuples: # (<context index>, (tsuid_X, tsuid_Y), <correlation result> ) # ) # # OUTPUT: RDD of ( ( <var X index>, <var Y index>), <low-level Result ID>, <Mean correlation>, <Var correlation> # ) # PROCESS: - creates and saves aggregated low-level results as CorrelationsByContext # - computes Mean and Variance of low-level results # - returns summarized info: Mean+Variance+ result ID rdd_results_corr_by_context = \ rdd_agg_correlations.map(lambda x: (_spark_build_corrs_by_context(variables=x[0], agg_ctx_ts_corr=x[1], desc_context=context_meta, sorted_variables=sorted_variables, sorted_contexts=sorted_contexts, corr_method=corr_method, parent_id=parent_id, ndigits=config.the_digits_number))) # Spark_step_6: # ------------ # # 6.1: collects # # INPUT: RDD of ( [ <var X index>, <var Y index>], <processdata ID>, <Mean(corr)>, <Var(corr)> # ) # # OUTPUT: collected list # # PROCESS: collects high-level results # collected_results_corr = rdd_results_corr_by_context.collect() # 6.2: prepare the result # # - Encodes the returned json-friendly content from the collected high-level results # - returns the result # matrix_mean = get_triangular_matrix(dim=len(sorted_variables), default_value_diag=1.0, default_value_other=None) matrix_variance = get_triangular_matrix(dim=len(sorted_variables), default_value_diag=0.0, default_value_other=None) matrix_id = get_triangular_matrix(dim=len(sorted_variables), default_value_diag=None, default_value_other=None) for var_index_pair, data_oid, mean, variance in collected_results_corr: var_index_row = var_index_pair[0] var_index_col = var_index_pair[1] # required: recomputes the range of cell in its row # triangular matrix => cell(i,j) is at position j-i of the row 
# triangular_matrix[i]
            matrix_mean[var_index_row][var_index_col - var_index_row] = mean
            matrix_variance[var_index_row][var_index_col - var_index_row] = variance
            matrix_id[var_index_row][var_index_col - var_index_row] = data_oid

        obj_result.set_contexts(contexts=sorted_contexts, meta_identifier=context_meta)
        obj_result.set_variables(sorted_variables)
        obj_result.add_matrix(matrix=matrix_mean, desc_label="Mean Correlation")
        obj_result.add_matrix(matrix=matrix_variance, desc_label="Variance")
        obj_result.add_rid_matrix(matrix_id)

        LOGGER.info("... ended correlation loop.")
        return obj_result.get_json_friendly_dict()

    except Exception:
        LOGGER.error("... ended correlation loop with error.")
        raise IkatsException("Failed execution: correlation_ts_list_loop()")
    finally:
        if sc:
            ScManager.stop()
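# Hedged usage sketch for correlation_ts_list_loop(): the metadata names follow the
# Airbus example given in the docstring ('FlightIdentifier' as context, 'metric' as
# variable). The "pearson" value for corr_method is an assumption about the keys of
# CORRELATION_METHODS and may need to be adapted; the ConfigCorrelationLoop keywords
# are those of the default value in the signature above.
def run_correlation_loop_example(ts_list):
    """
    Usage sketch: correlation loop over a selection of timeseries.
    """
    config = ConfigCorrelationLoop(the_num_partitions=24,
                                   the_point_cache_size=50e6,
                                   the_digits_number=4)

    # Returns the JSON-friendly dict with the Mean / Variance matrices and the
    # matrix of references to the low-level CorrelationsByContext results
    return correlation_ts_list_loop(ts_list=ts_list,
                                    corr_method="pearson",  # assumed key of CORRELATION_METHODS
                                    context_meta="FlightIdentifier",
                                    variable_meta="metric",
                                    config=config)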
def test_sliding_window_recovery(self): """ Testing the recovery parameter. """ sax_info = ConfigSax(paa=3, sequences_size=6, with_mean=True, with_std=True, global_norm=False, local_norm=False, linear_filter=False, recovery=0.5, coefficients=[1, 1], alphabet_size=6) ts_name = ["linear_time_serie"] spark_ctx = ScManager.get() # Test with recovery = 0.5 result, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() # 2 sequences in the timeseries => 3 sequences at the end self.assertEqual(len(result), 3) # Test with MAX recovery # recovery = 1 (the maximum : 100 % <=> the next window start one point to the right) sax_info.recovery = 1.0 result, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx) result = result.collect() # remember that in 'sliding_window' function, we call 'get_ts_mock(ts_name)[0]' ts = get_ts_mock(ts_name)[0] ts_val_0 = list(ts[0:6][:, 1]) ts_val_1 = list(ts[6:12][:, 1]) timestamp_0 = list(ts[0:6][:, 0]) timestamp_1 = list(ts[6:12][:, 0]) # Check the timestamp and the values of the two sequences # result[i] = (key, list([timestamps, values],[,],...)) # check ts value condition = (np.all(result[i][1][:, 1] in ts_val_0 for i in range(len(result))) or np.all(result[i][1][:, 1] in ts_val_1 for i in range(len(result)))) self.assertTrue(condition) # check timestamps condition = (np.all(result[i][1][:, 0] in timestamp_0 for i in range(len(result))) or np.all(result[i][1][:, 0] in timestamp_1 for i in range(len(result)))) self.assertTrue(condition) # Test with MINIMUM recovery # recovery = 0 (no recovery) sax_info.recovery = 0.01 result2, _ = sliding_windows(ts_list=ts_name, sax_info=sax_info, spark_ctx=spark_ctx) result2 = result2.collect() # 2 sequences in the timeseries => 2 sequences self.assertEqual(len(result2), 2)
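# Illustrative helper (an assumption, not project code): the comments in the test above
# suggest that the sliding-window step is roughly sequences_size * (1 - recovery), with
# recovery=1.0 meaning a one-point step. This tiny sketch makes that assumed relation
# explicit for the three cases exercised by the test (12-point mocked timeseries).
def expected_window_count(nb_points, sequences_size, recovery):
    """Rough number of sliding windows under the assumed step rule."""
    step = max(1, round(sequences_size * (1.0 - recovery)))
    return (nb_points - sequences_size) // step + 1

# expected_window_count(12, 6, 0.5)  -> 3  (matches the first assertion)
# expected_window_count(12, 6, 1.0)  -> 7  (one-point step)
# expected_window_count(12, 6, 0.01) -> 2  (no overlap)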