Example #1
    def setUpClass(cls):
        """
        Needs environment variables to be explicitly set:
        SPARK_HOME and PYSPARK_PYTHON
        """
        if os.getenv("PYSPARK_PYTHON") is None:
            assert "env PYSPARK_PYTHON must be defined"
        if os.getenv("SPARK_HOME") is None:
            assert "env SPARK_HOME must be defined"

        # Create a spark Context
        ScManager.create()
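A minimal way to satisfy these requirements before the suite starts is sketched below; the paths are placeholders, not values from the original project.

import os

# Hypothetical bootstrap executed before the tests: point Spark at a local
# installation (placeholder paths) so ScManager.create() can succeed.
os.environ.setdefault("SPARK_HOME", "/opt/spark")
os.environ.setdefault("PYSPARK_PYTHON", "/usr/bin/python3")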
Example #2
    def test_collision_same_words(self):
        """
        The words are all the same
        """

        sc = ScManager.get()

        sax_result = SaxResult(paa=sc.parallelize([]),
                               breakpoints=[],
                               sax_word='abcdabcdabcdabcd')
        sax, _, _ = sax_result.start_sax(4, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=6,
                                           index_selected=2,
                                           word_len=sequences_size,
                                           spark_ctx=sc)

        result = result.data

        # exactly the same words => six cells, each holding the maximum number of collisions (6)
        nb_cell = 0
        for i in result:
            if i[0] == 6:
                nb_cell += 1
        self.assertEqual(nb_cell, 6)
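A back-of-the-envelope check of the assertion above, assuming the matrix counts one collision per iteration for each pair of words that match on the selected indexes (scipy only):

from scipy.special import binom

nb_words = 4       # 'abcd' repeated four times in the SAX word
nb_index = 2       # index_selected used in the test
iterations = 6     # number_of_iterations used in the test, equal to binom(4, 2)

# Identical words collide on every iteration, so each non-zero cell reaches `iterations`
max_cell_value = iterations
# One cell per unordered pair of words
nb_cells = int(binom(nb_words, 2))

assert (max_cell_value, nb_cells) == (6, 6)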
Example #3
    def test_sw_sax_limit_constant(self):
        """
        Test sliding window and SAX on a constant timeseries containing two larger values
        """
        sax_info = ConfigSax(paa=10,
                             sequences_size=10,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=5)

        spark_ctx = ScManager.get()

        result, _ = sliding_windows(ts_list=["specific_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        print("result={}".format(result.collect()))

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        print("sax_word={}".format(sax_result.sax_word))
        # PAA_value = 0 => 'c'
        # PAA_value = 10 => 'e' or 'd'
        # PAA_value = -10 => 'a' or 'b'
        self.assertIn(sax_result.sax_word, ('ccccccccae', 'ccccccccbd'))
Example #4
    def test_sliding_window_sax_basic(self):
        """
        Test the nominal case
        """
        sax_info = ConfigSax(paa=3,
                             sequences_size=6,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=3)

        spark_ctx = ScManager.get()
        result, _ = sliding_windows(ts_list=["linear_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        # recovery = 0.5 and word_size = 3 => sax_result = 'aab abc bcc'
        self.assertEqual(sax_result.sax_word, 'aababcbcc')
Example #5
    def test_coll_various_words(self):
        """
        Test the collision matrix for same and different words
        The words 0 and 3 are the same, the words 1 and 2 too
        """

        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(paa=sc.parallelize([]),
                               breakpoints=[],
                               sax_word=''.join(
                                   ['ababa', 'cdcdc', 'cdcdc', 'ababa']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)
        result = result.data
        result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]),
                                                    int(x[1][1])))
        print(result)
        # the maximum number of possible combinations without repetition is 10
        # two cells of 10: one for the collisions between words 1 and 2, another for words 0 and 3
        for i in range(2):
            self.assertTrue(result[i][0] == 10)
        self.assertTrue(
            int(result[0][1][0]) == 2 and int(result[0][1][1]) == 1)
        self.assertTrue(
            int(result[1][1][0]) == 3 and int(result[1][1][1]) == 0)
Example #6
    def _run_all_in_master_memory(self, method):
        """
        Run the spark pearson correlation by loading all the TS content (i.e. values) into master memory

        Each coefficient will be computed by a worker (Spark decides the best choice to apply)
        """

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Get TS content
        rdd_content = self._get_ts(spark_context)

        # Job distribution is made by Statistics.corr (Spark correlation matrix calculation)
        self.results = Statistics.corr(rdd_content, method=method)

        ScManager.stop()
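For reference, a self-contained sketch of the Statistics.corr call used above: it takes an RDD of equal-length vectors (one per TS) and returns the correlation matrix as a NumPy array. Getting the context directly from SparkContext here is only for the sketch; the IKATS code above goes through ScManager.

from pyspark import SparkContext
from pyspark.mllib.stat import Statistics

sc = SparkContext.getOrCreate()

# One row per timeseries, one column per timestamp (toy values)
rdd_content = sc.parallelize([
    [1.0, 2.0, 3.0, 4.0],
    [2.0, 4.0, 6.0, 8.0],
    [4.0, 3.0, 2.0, 1.0],
])

# Pearson correlation matrix, here a 3x3 numpy.ndarray
corr_matrix = Statistics.corr(rdd_content, method="pearson")
print(corr_matrix)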
Example #7
    def _apply_motif_iter_zero_coll(self, activate_spark):
        """
        Test
         - with the iterative method to search the neighborhood motif,
         - with/without spark jobs
         - and where the words are all different => no collisions
        """
        spark_context = ScManager.get()
        # Build the SAX result with different words, and small breakpoints
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-0.3, -0.1, 0.1, 0.3],
                               sax_word='abcdebcdeacdeabdeabceabcd')
        sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(nb_seq)

        # Different words => only zero cells in the collision matrix
        collision_matrix = SparseMatrix(np.zeros((nb_seq, nb_seq)))

        # Build the class for motif search
        search_info = NeighborhoodSearch(size_sequence=20,
                                         mindist_lookup_table=breakpoint,
                                         alphabet_size=5,
                                         sax=np.transpose(sax),
                                         radius=1000,
                                         collision_matrix=collision_matrix)

        recognition_info = ConfigRecognition(
            is_stopped_by_eq9=True,
            iterations=100,
            min_value=1,
            is_algo_method_global=False,
            activate_spark=activate_spark,
            radius=1000,
            neighborhood_method=OPT_USING_BRUTE_FORCE)

        # neighborhood_method=OPT_USING_BRUTE_FORCE
        result = search_info.motif_neighborhood_iterative(30, recognition_info)

        # There are no similar sequences
        self.assertEqual(len(result), 0)

        # neighborhood_method=OPT_USING_COLLISIONS
        recognition_info.neighborhood_method = OPT_USING_COLLISIONS
        result = search_info.motif_neighborhood_iterative(30, recognition_info)

        # There are no similar sequences
        self.assertEqual(len(result), 0)
Example #8
    def __init__(self, tdm, ts_load_split_size=10):
        """
        Initialize the spark distance class

        :param tdm: the temporal data manager client
        :type tdm: TemporalDataMgr

        :param ts_load_split_size: size of TS packet to load from TDM
        :type ts_load_split_size: int

        """

        self.tdm = tdm
        self.ts_load_split_size = ts_load_split_size
        self.spark_context = ScManager.get()

        self.logger = logging.getLogger(__name__)
Example #9
    def test_coll_near_same_words(self):
        """
        The words share 1, 2, 3 or 4 letter positions, but they are never exactly the same because each
        word has five letters
        """
        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(
            paa=sc.parallelize([]),
            breakpoints=[],
            sax_word=''.join(['aaaaa', 'abbbb', 'abccc', 'abcdd', 'abcde']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)

        # sorted result list
        result = result.data
        result.sort(key=lambda x: "{}-{}-{}".format(int(x[0]), int(x[1][0]),
                                                    int(x[1][1])))
        print(result)

        # sorted list expected:
        expected_result = [(1.0, (2, 1)), (1.0, (3, 1)), (3.0, (3, 2)),
                           (1.0, (4, 1)), (3.0, (4, 2)), (6.0, (4, 3))]
        expected_result.sort(key=lambda x: "{}-{}-{}".format(
            int(x[0]), int(x[1][0]), int(x[1][1])))

        self.assertEqual(len(result), len(expected_result))
        for expected_item, res_item in zip(expected_result, result):
            self.assertEqual(expected_item[0], res_item[0], 'nb collisions')
            self.assertEqual(expected_item[1][0], res_item[1][0],
                             'seq index left-side')
            self.assertEqual(expected_item[1][1], res_item[1][1],
                             'seq index right-side')
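The expected values above can be recovered by counting, for each pair of words, how many of the binom(5, 2) position pairs fall entirely on matching letters; a small check of that reasoning:

from itertools import combinations
from scipy.special import binom

words = ['aaaaa', 'abbbb', 'abccc', 'abcdd', 'abcde']
nb_index = 2

expected = {}
for j, i in combinations(range(len(words)), 2):
    # number of positions where the two words carry the same letter
    matching = sum(a == b for a, b in zip(words[i], words[j]))
    collisions = int(binom(matching, nb_index))
    if collisions:
        expected[(i, j)] = collisions

print(expected)
# {(2, 1): 1, (3, 1): 1, (4, 1): 1, (3, 2): 3, (4, 2): 3, (4, 3): 6}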
Example #10
    def test_sliding_window_filter(self):
        """
        Testing linear filter.
        """
        sax_info = ConfigSax(paa=3,
                             sequences_size=6,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=True,
                             recovery=0.5,
                             coefficients=[1, 0.5],
                             alphabet_size=6)

        spark_ctx = ScManager.get()
        # Test for linear sequences
        result, _ = sliding_windows(ts_list=["linear_time_serie"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        result = result.collect()
        # all sequences are linear => no sequence kept
        self.assertEqual(len(result), 0)

        # Test for constant sequences with no recovery (recovery = 0 => no overlap between sequences)
        sax_info.coefficients = [0, 1]
        sax_info.recovery = 0
        result, _ = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
        result = result.collect()
        LOGGER.info("result=%s", result)
        LOGGER.info("ts_init=%s", get_ts_mock("ts_with_constant_pattern"))
        # Sequence of 12 pts, recovery = 0 (no recovery) -> 2 sequences
        self.assertEqual(len(result), 2)
Example #11
    def test_collision_different_words(self):
        """
        The words are all different
        """
        nb_paa = 5
        nb_index = 2
        sc = ScManager.get()
        sax_result = SaxResult(
            paa=sc.parallelize([]),
            breakpoints=[],
            sax_word=''.join(['abcde', 'fghij', 'klmno', 'pqrst', 'uvwxy']))

        sax, _, _ = sax_result.start_sax(nb_paa, spark_ctx=sc)
        sequences_size = np.array(sax.collect()).shape[1]
        result, _ = final_collision_matrix(sax=sax,
                                           number_of_iterations=int(
                                               binom(nb_paa, nb_index)),
                                           index_selected=nb_index,
                                           word_len=sequences_size,
                                           spark_ctx=sc)
        result = result.data

        # different words => only zero cells in the matrix
        self.assertEqual(len(result), 0)
Example #12
    def test_sax(self):
        """
        Test without PAA computation (4 PAA values for 4 points per sequence); the PAA values are equidistant
        """
        sax_info = ConfigSax(paa=4,
                             sequences_size=4,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=4)
        spark_ctx = ScManager.get()
        result, _ = sliding_windows(
            ts_list=["simple_sequences_ts0", "simple_sequences_ts1"],
            sax_info=sax_info,
            spark_ctx=spark_ctx)

        LOGGER.info("sliding_windows done!")

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        result = result.collect()
        LOGGER.info("sax_result=%s", sax_result)
        LOGGER.info("result=%s", result)

        # the PAA : [[4, 4, 0, 2], [-2, 2, -2, 0]]
        self.assertEqual(sax_result.paa.collect(), [4, 4, 0, 2, -2, 2, -2, 0])
        # the result expected : 'ddbc acab'
        self.assertEqual(sax_result.sax_word, 'ddbcacab')

        # Test with PAA computation
        sax_info = ConfigSax(paa=4,
                             sequences_size=12,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[0.1, 0.9],
                             alphabet_size=4)

        result, _ = sliding_windows(
            ts_list=["sequences_1_ts0", "sequences_1_ts1"],
            sax_info=sax_info,
            spark_ctx=spark_ctx)

        sax_result = run_sax_on_sequences(rdd_sequences_data=result,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

        # the PAA : [[1, 4, -2, 1], [4, -2, -3, -3]]
        self.assertEqual(sax_result.paa.collect(),
                         [1, 4, -2, 1, 4, -2, -3, -3])
        # the result expected : 'cdbc dbaa'
        self.assertEqual(sax_result.sax_word, 'cdbcdbaa')
Example #13
File: sax.py Project: IKATS/op-sax
def run_sax_from_ts_list(ts_list,
                         alphabet_size,
                         word_size,
                         normalize=False,
                         activate_spark=None):
    """
    Perform the Symbolic Aggregate Approximation (SAX) on the TSUID list provided in **ts_list**

    Use spark if necessary

    .. note::
        If spark fails, the local computation will be performed

    :param ts_list: list of TSUID of the TS used to compute the PAA timeseries
    :type ts_list: list

    :param alphabet_size: number of characters in result word
    :type alphabet_size: int

    :param word_size: number of segments
    :type word_size: int

    :param activate_spark: True to force spark, False to force local, None to let the algorithm decide
    :type activate_spark: bool or none

    :param normalize: Apply the normalization of the TS if True (False:default)
    :type normalize: bool

    :return: A dict keyed by TSUID; each value is composed of the PAA result, the SAX breakpoints,
             the SAX string and the points
    :rtype: dict
    """

    results = {}

    # Define if spark is necessary
    if activate_spark is None:

        md = IkatsApi.md.read(ts_list=ts_list)
        sum_points = 0
        for tsuid in md:
            if 'qual_nb_points' in md[tsuid]:
                sum_points += float(md[tsuid]['qual_nb_points'])
            else:
                # No information on number of points, consider using spark
                sum_points = 0
                break
        spark_nb_points_trigger = 1E5
        if sum_points == 0 or sum_points / len(
                ts_list) > spark_nb_points_trigger:
            # Spark is active if the average number of points per TS is greater than spark_nb_points_trigger points
            activate_spark = True

    if activate_spark:
        LOGGER.info("Running SAX using Spark")

        # Create or get a spark Context
        spark_context = ScManager.get()

        # Build the RDD with TSUIDS
        rdd = spark_context.parallelize(ts_list)

        # Create a broadcast for spark jobs
        broadcast = spark_context.broadcast({
            "alphabet_size": alphabet_size,
            "word_size": word_size,
            "normalize": normalize,
        })

        # Create an accumulator to store the results of the spark workers
        accumulator = spark_context.accumulator(dict(), ListAccumulatorParam())

        def run_sax_spark(working_tsuid):
            """
            Method called by spark job

            :param working_tsuid: rdd item
            """

            results = run_sax_from_tsuid(
                tsuid=working_tsuid,
                alphabet_size=broadcast.value['alphabet_size'],
                word_size=broadcast.value['word_size'],
                normalize=broadcast.value['normalize'])

            accumulator.add({working_tsuid: results})

        # Get TS content using spark distribution to increase performance
        # noinspection PyBroadException
        try:
            rdd.foreach(run_sax_spark)
        except Exception:
            LOGGER.warning(
                'Something went wrong with Spark, using local computation')
            activate_spark = False

        for ts in ts_list:
            if ts in accumulator.value:
                results[ts] = accumulator.value[ts]
            else:
                LOGGER.warning(
                    "TS %s has encountered an issue during the spark distribution",
                    ts)

        ScManager.stop()

    if not activate_spark:
        LOGGER.info("Running SAX on single instance")

        for ts in ts_list:
            results[ts] = run_sax_from_tsuid(tsuid=ts,
                                             alphabet_size=alphabet_size,
                                             word_size=word_size,
                                             normalize=normalize)

            # print("TS=%s\nnorm=%s\nr=%s\n\n"%(ts,normalize,results[ts]['sax_breakpoints'][0]))

    return results
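The accumulator above relies on ListAccumulatorParam, which is not shown in this example. A plausible minimal implementation, assuming each worker contributes a {tsuid: result} dict, could look like this:

from pyspark import AccumulatorParam


class DictAccumulatorParam(AccumulatorParam):
    """Hypothetical stand-in for ListAccumulatorParam: merges per-worker dicts."""

    def zero(self, initial_value):
        # Start from an empty dict, whatever the initial value is
        return dict()

    def addInPlace(self, value1, value2):
        # Merge one worker's contribution into the running dict
        value1.update(value2)
        return value1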
Example #14
    def test_sliding_window_norm(self):
        """
        Testing global and local norm.
        """
        epsilon = 1.0e-10
        # recovery = 0 (no overlap) -> 3 seq of 4 points (nb_points = 12)
        sax_info = ConfigSax(paa=3,
                             sequences_size=4,
                             with_mean=True,
                             with_std=True,
                             global_norm=True,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0,
                             coefficients=[0.1, 1],
                             alphabet_size=6)

        spark_ctx = ScManager.get()
        # Test with global normalization : the timeseries is normalized
        result, coeff = sliding_windows(ts_list=["linear_time_serie"],
                                        sax_info=sax_info,
                                        spark_ctx=spark_ctx)

        result = result.collect()
        coeff = coeff.collect()
        # Check coeff : coeff is the mean and variance of each sequence

        # 12 points no recovery (recovery=0) -> 3 seq of 4 points
        self.assertEqual(len(coeff), 3)

        # ts_value is an array with the sequences values
        ts_value = np.array([])
        for i, _ in enumerate(result):
            # result[i] = (key, list([timestamps, values],[,],...))
            ts_value = np.concatenate((result[i][1][:, 1], ts_value))

        LOGGER.info("result=%s", result)
        # no recovery => 3 seq * 4 points = 12 values = nb_points
        self.assertEqual(len(ts_value), 12)

        LOGGER.info("ts_std=%s", (ts_value.std()))
        LOGGER.info("ts_mean=%s", np.mean(ts_value))
        # global normalisation => ts_value has a standard deviation of 1 and a mean of 0
        self.assertTrue(1 - epsilon < np.std(ts_value) < 1 + epsilon)
        self.assertTrue(-epsilon < np.mean(ts_value) < epsilon)

        # Test with local normalization : all the sequences are normalized
        sax_info.global_norm = False
        sax_info.local_norm = True
        sax_info.linear_filter = True

        # Recovery = 1 : maximum recovery
        sax_info.recovery = 1
        result, coeff = sliding_windows(ts_list=["ts_with_constant_pattern"],
                                        sax_info=sax_info,
                                        spark_ctx=spark_ctx)
        result = result.collect()

        # Verify that each sequence is normalized
        for i, _ in enumerate(result):
            # result[i] = (key, list([timestamps, values],[,],...))
            seq_value = result[i][1][:, 1]
            self.assertTrue(1 - epsilon < np.std(seq_value) < 1 + epsilon)
            self.assertTrue(-epsilon < np.mean(seq_value) < epsilon)
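The assertions above check the usual z-normalization properties (zero mean, unit standard deviation). A minimal sketch of that normalization, independent of the sliding_windows implementation, which is not shown here:

import numpy as np


def znorm(values):
    # Zero-mean, unit-standard-deviation normalization checked by the test above
    values = np.asarray(values, dtype=float)
    return (values - values.mean()) / values.std()


seq = znorm([1.0, 2.0, 3.0, 4.0])
assert abs(seq.mean()) < 1e-10
assert abs(seq.std() - 1.0) < 1e-10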
Example #15
def cut_ds_from_metric(ds_name,
                       metric,
                       criteria,
                       group_by=None,
                       fid_pattern=None,
                       chunk_size=75000):
    """
    Entry point of the method that cuts a dataset based on the criteria applied to the TS matching the metric

    The criteria expression is a python expression that will be converted to a lambda expression with 'M' used as metric
    value.
    Example: "M > 7 and M not in [1,2,6]"

    :param ds_name: name of the dataset to use
    :param metric: metric used as reference to find cut ranges
    :param criteria: criteria expression describing the value thresholds.
    :param group_by: name of the metadata whose values define the groups to process (default None to disable grouping)
    :param fid_pattern: name of the generated TS.
                        Variables can be used:
                        - {fid}   : Functional identifier
                        - {M}     : metric
    :param chunk_size: Size of the ideal chunk (in number of points per chunk)

    :type ds_name: str
    :type metric: str
    :type criteria: str
    :type group_by: str or None
    :type fid_pattern: str
    :type chunk_size: int

    :return: the ts list of the generated TS. [{"funcId": "xx", "tsuid":"xx"}]
    :rtype: list

    :raises ValueError: if dataset is empty
    :raises ValueError: if metric is found several times in dataset
    :raises ValueError: if metric is not found in dataset
    :raises ValueError: if group_by doesn't have a matching reference
    :raises KeyError: if error in fid_pattern
    """

    # List of TS present in dataset
    ts_list = IkatsApi.ds.read(ds_name=ds_name)['ts_list']

    if len(ts_list) == 0:
        LOGGER.error("Dataset %s is empty", ds_name)
        raise ValueError("Dataset %s is empty" % ds_name)

    # Get all the metadata
    md_list = IkatsApi.md.read(ts_list=ts_list)

    # List of all possible values encountered for the group by
    groups_list = None
    if group_by not in [None, ""]:
        # Get all the groups for this group by criterion
        groups_list = _find_all_groups(group_by, md_list)
        LOGGER.info("%s groups found for [%s]", len(groups_list), group_by)
    else:
        # Force to None
        group_by = None

    # Find the reference TS and all TS to cut using this ref
    grouped_ts_list = _find_ts_ref_group(ds_name=ds_name,
                                         md_list=md_list,
                                         metric=metric,
                                         ts_list=ts_list,
                                         group_by=group_by,
                                         group_by_list=groups_list)

    # Get Spark Context
    # Important: use only this method in Ikats to get a spark context
    spark_context = ScManager.get()

    try:
        result = []

        # For each group (processed in alphabetic order)
        for group in sorted(grouped_ts_list):
            result_iter = _cut_from_metric_for_group(
                chunk_size=chunk_size,
                criteria=criteria,
                ds_name=ds_name,
                fid_pattern=fid_pattern,
                md_list=md_list,
                metric=metric,
                spark_context=spark_context,
                group=grouped_ts_list[group])

            # Sort functional identifiers alphabetically
            result.extend(sorted(result_iter, key=lambda x: x['funcId']))

        return result
    finally:
        ScManager.stop()
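A minimal illustration of the criteria-to-lambda conversion described in the docstring above; the helper that actually performs it is not part of this example:

# Hypothetical illustration: 'M' stands for the metric value
criteria = "M > 7 and M not in [1, 2, 6]"
matches = eval("lambda M: " + criteria)

assert matches(10) is True
assert matches(6) is False
assert matches(3) is False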
Example #16
File: slope.py Project: IKATS/op-slope
def compute_slope(ts_list,
                  fid_suffix="_slope",
                  chunk_size=75000,
                  save_new_ts=True):
    """
    Compute the slope of a list of timeseries using spark

    This implementation computes slope for one TS at a time in a loop.
    To know the details of the computation, see the corresponding method

    :param ts_list: list of TS. Each item is a dict composed of a TSUID and a functional id
    :param fid_suffix: Functional identifier suffix of the final timeseries
    :param chunk_size: Number of points per chunk (assuming the TS is periodic)
    :param save_new_ts: True (default) if TS must be saved to database

    :type ts_list: list of dict
    :type fid_suffix: str
    :type chunk_size: int
    :type save_new_ts: bool

    :return: the new list of derived TS (same order as input)
    :rtype: list of dict

    :raise TypeError: if ts_list type is incompatible
    """

    # Check inputs
    if not isinstance(ts_list, list):
        raise TypeError("ts_list shall be a list")
    if len(ts_list) == 0:
        raise TypeError("ts_list must have at least one element")

    LOGGER.info('Computing Slope for %s TS', len(ts_list))

    tsuid_list = ts_list
    try:
        # Extract TSUID from ts_list
        tsuid_list = [x['tsuid'] for x in ts_list]
    except Exception:
        # Already a tsuid_list.
        # Getting the functional id for each ts
        ts_list = [{
            'tsuid': x,
            'funcId': IkatsApi.fid.read(x)
        } for x in ts_list]

    # Gather all metadata for the list of TS to compute slope
    md_list = IkatsApi.md.read(tsuid_list)

    # Results will be stored here
    results = []

    try:
        # Get Spark Context
        spark_context = ScManager.get()

        for index, tsuid in enumerate(tsuid_list):
            fid = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]
            LOGGER.info('Processing Slope for TS %s (%s/%s) (%s)', fid,
                        (index + 1), len(tsuid_list), tsuid)

            computed_tsuid, computed_fid = compute_slope_for_tsuid(
                spark_context=spark_context,
                fid=fid,
                fid_suffix=fid_suffix,
                tsuid=tsuid,
                md_list=md_list,
                chunk_size=chunk_size,
                save_new_ts=save_new_ts)

            # Append results to final results
            results.append({"tsuid": computed_tsuid, "funcId": computed_fid})
    except Exception:
        raise
    finally:
        # Stop spark context in all cases
        ScManager.stop()

    return results
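The per-TS computation is delegated to compute_slope_for_tsuid, which is not shown here. As an assumption only, a common definition of the slope of a timeseries is the finite difference between consecutive points:

import numpy as np

# Toy chunk of TS data: column 0 = timestamp (ms), column 1 = value
data = np.array([[1000.0, 2.0],
                 [2000.0, 5.0],
                 [3000.0, 4.0]])

# Assumed definition: slope between each pair of consecutive points
slopes = np.diff(data[:, 1]) / np.diff(data[:, 0])
slope_ts = np.column_stack((data[:-1, 0], slopes))
print(slope_ts)  # [[1000., 0.003], [2000., -0.001]]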
Example #17
def _resample(resampling_way,
              ts_list,
              resampling_period,
              adding_method=AddingMethod.LINEAR_INTERPOLATION,
              timestamp_position=TimestampPosition.BEG,
              aggregation_method=AggregationMethod.AVG,
              nb_points_by_chunk=50000,
              generate_metadata=False):
    """
    Function that effectively resamples (UP or DOWN according to resampling_way value) using Spark

    :param resampling_way: way of resampling (UP or DOWN)
    :type resampling_way: ResamplingWay

    :param ts_list: list composing the TS information to resample [{'tsuid': xxx, 'funcId': yyy },...]
    :type ts_list: list of dict

    :param resampling_period: target period for resampling (in ms)
    :type resampling_period: int

    :param adding_method: Method to use for interpolation (see type AddingMethod for more information)
    :type adding_method: AddingMethod or str or int

    :param timestamp_position: timestamp position in the interval while downsampling
    :type timestamp_position: str ('BEG','MID','END')

    :param aggregation_method: aggregation method for downsampling
    :type aggregation_method: str ('MIN','MAX','MED','AVG','FIRST','LAST')

    :param nb_points_by_chunk: user defined number of points used for a spark chunk of data (after resampling)
    :type nb_points_by_chunk: int

    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
    :type generate_metadata: boolean (default : False)

    :returns: a list of dict [{'tsuid': xxx, 'funcId': yyy },...]
    :rtype: list of dict
    """

    if ts_list == []:
        return []

    fid_dict = dict()
    for ts in ts_list:
        fid_dict[ts['funcId']] = ts['tsuid']

    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Extract tsuid list from inputs

    tsuid_list = [x["tsuid"] for x in ts_list]

    # Checking metadata availability before starting resampling
    meta_list = IkatsApi.md.read(tsuid_list)

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Timeseries %s : no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for resampling %s" %
                             tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata 'ikats_start_date' for timeseries %s not found in base",
                tsuid)
            raise ValueError("No start date available for resampling [%s]" %
                             tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "meta data 'ikats_end_date' for timeseries %s not found in base",
                tsuid)
            raise ValueError("No end date available for resampling [%s]" %
                             tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error(
                "Metadata qual_ref_period' for timeseries %s not found in base",
                tsuid)
            raise ValueError(
                "No reference period available for resampling [%s]" % tsuid)

        # Original timeseries information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original timeseries
        fid_origin = [x['funcId'] for x in ts_list if x['tsuid'] == tsuid][0]

        # Generate functional id for resulting timeseries
        if resampling_way == ResamplingWay.UP_SAMPLING:
            func_id = "%s_resampled_to_%sms_%s" % (
                fid_origin, str(resampling_period), str(adding_method))
        else:
            func_id = "%s_resampled_to_%sms_%s_%s" % (
                fid_origin, str(resampling_period), timestamp_position,
                aggregation_method)

        # Creating new reference in database for new timeseries
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation :

        # Computing elementary size which is the lowest common multiple between ref period and resampling period
        elementary_size = _lowest_common_multiple(ref_period,
                                                  resampling_period)

        # Find how many elementary sizes hold a number of points closest to nb_points_by_chunk
        # in order to compute the final data chunk size
        nb_points_for_elementary_size = int(elementary_size /
                                            resampling_period)
        data_chunk_size = int(nb_points_by_chunk /
                              nb_points_for_elementary_size) * elementary_size

        # Limit the size of data_chunk_size
        if data_chunk_size < elementary_size:
            data_chunk_size = elementary_size

        # Computing intervals for chunk definition
        interval_limits = np.hstack((np.arange(sd,
                                               ed,
                                               data_chunk_size,
                                               dtype=np.int64), ed))

        # from intervals we define chunk of data to compute
        # ex : intervals = [ 1, 2, 3] => 2 chunks [1, 2] and [2, 3]
        if len(interval_limits) > 2:
            # there are more than 2 interval limits, i.e. more than one chunk to compute
            data_to_compute.extend([(tsuid, func_id, i, interval_limits[i],
                                     interval_limits[i + 1])
                                    for i in range(len(interval_limits) - 1)])
        elif len(interval_limits) > 1:
            # only one chunk to compute
            data_to_compute.append(
                (tsuid, func_id, 0, interval_limits[0], interval_limits[1]))

        # in case last original point and last downsampled point are aligned => add a supplementary chunk to compute
        # last point
        if (interval_limits[-1] - sd) % resampling_period == 0:
            data_to_compute.append((tsuid, func_id, 1, interval_limits[-1],
                                    interval_limits[-1] + resampling_period))

    LOGGER.info("Running resampling using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    if resampling_way == ResamplingWay.UP_SAMPLING:
        spark_function = _spark_upsample
        args = adding_method
    else:
        spark_function = _spark_downsample
        args = (timestamp_position, aggregation_method)

    try:

        # OUTPUT : [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute,
                                           len(data_to_compute))

        # INPUT :  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT : [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS : read original data in database / filter chunk with no data
        rdd_data_with_chunk_index = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        if resampling_way == ResamplingWay.UP_SAMPLING:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
            # PROCESS : compute inter-chunks intervals / filter empty chunks
            rdd_data = _calc_inter_chunks(rdd=rdd_data_with_chunk_index) \
                .map(lambda x: (x[0], x[2])) \
                .filter(lambda x: not (len(x[1]) == 2 and (int(float(x[1][0][0])) == int(float(x[1][1][0])))))
        else:
            # INPUT :  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
            # OUTPUT : [((TSUID_origin, func_id), original_data_array), ...]
            # PROCESS : suppress useless chunk indexes
            rdd_data = rdd_data_with_chunk_index.map(lambda x: (x[0], x[2]))

        # INPUT :  [((TSUID_origin, func_id), original_data_array_with_inter_chunks), ...]
        # OUTPUT : [((TSUID_origin, func_id), data_resampled_array), ...]
        # PROCESS : resample chunks of data to resampling_period
        rdd_resampled_data = rdd_data.map(
            lambda x: (x[0], spark_function(data=x[1], period=resampling_period, args=args))) \
            .filter(lambda x: len(x[1]) > 0)

        # INPUT :  [((TSUID_origin, func_id), data_resampled_array), ...]
        # OUTPUT : [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS : create resampled data in database / compute global start and end date
        identifiers = rdd_resampled_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while resampling with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()

    # Post-processing: metadata import and return dict building

    # returns dict containing the results of the resampling
    # where key is the original TSUID and values are resampled TSUID and functional identifiers
    returned_dict = {}
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid,
                       md_name='qual_ref_period',
                       md_value=resampling_period,
                       data_type=DTYPE.number,
                       force_update=True)
        _save_metadata(tsuid=tsuid,
                       md_name='ikats_start_date',
                       md_value=sd,
                       data_type=DTYPE.date,
                       force_update=True)
        _save_metadata(tsuid=tsuid,
                       md_name='ikats_end_date',
                       md_value=ed,
                       data_type=DTYPE.date,
                       force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid,
                           name='qual_nb_points',
                           value=qual_nb_points,
                           data_type=DTYPE.number,
                           force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned list
        returned_dict[tsuid_origin] = {"tsuid": tsuid, 'funcId': func_id}

    return returned_dict
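A small sketch of the chunk sizing described in the comments above; _lowest_common_multiple is not shown in this example, so a standard gcd-based version is assumed here:

from math import gcd


def _lowest_common_multiple(value_a, value_b):
    # Assumed implementation of the helper referenced above
    return value_a * value_b // gcd(value_a, value_b)


ref_period = 3000          # ms, taken from qual_ref_period (toy value)
resampling_period = 2000   # ms, target period (toy value)
nb_points_by_chunk = 50000

elementary_size = _lowest_common_multiple(ref_period, resampling_period)   # 6000 ms
nb_points_for_elementary_size = int(elementary_size / resampling_period)   # 3 points
data_chunk_size = int(nb_points_by_chunk / nb_points_for_elementary_size) * elementary_size

print(elementary_size, data_chunk_size)  # 6000 99996000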
Example #18
    def _apply_motif_global_same_words(self, activate_spark):
        """
        Test
        - with the global method to search the neighborhood motif,
        - with/without spark jobs according to activate_spark
        - and where the words are all the same
        """
        spark_context = ScManager.get()
        # Build the SAX result with large breakpoints
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-300, -100, 100, 300],
                               sax_word='abcdeabcdeabcdeabcde')
        sax, _, _ = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(alphabet_size=5)

        # Build the collision matrix result
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0],
                      [100, 0, 0, 0],
                      [100, 100, 0, 0],
                      [100, 100, 100, 0]]))

        # two identical cases here: brute force / with collisions
        for method_opt in [OPT_USING_BRUTE_FORCE, OPT_USING_COLLISIONS]:
            #  mindist distances:
            #
            # [[ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]
            #  [ 0.  0.  0.  0.]]

            # Build the class for motif search
            search_info = NeighborhoodSearch(size_sequence=20,
                                             mindist_lookup_table=breakpoint,
                                             alphabet_size=5,
                                             sax=np.transpose(sax),
                                             radius=0.01,
                                             collision_matrix=collision_matrix)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=1,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=0.01,
                neighborhood_method=method_opt)

            # Search using neighborhood_method=method_opt (compare with all the words)
            result = search_info.motif_neighborhood_global(
                30, recognition_info)

            self._print_mindist_mat(search_info)

            # The words corresponding to the six largest-value cells have a MINDIST < radius
            self.assertEqual(len(result), 1)
            # The results are the same: [0, 1, 2, 3]: the 6 groups have been reduced to one
            self.assertEqual(result, [[0, 1, 2, 3]])
Example #19
File: ds_cut.py Project: IKATS/op-ts_cut
def dataset_cut_spark(tsuid_list, start, end, nb_points, nb_points_by_chunk, generate_metadata, meta_list):
    """
    Cutting dataset algorithm, using spark

    :param tsuid_list: list of tsuid
    :param start: start cut date
    :param end: end cut date
    :param nb_points: number of points to cut
    :param nb_points_by_chunk: number of points per chunk
    :param generate_metadata: True to generate metadata on-the-fly (ikats_start_date, ikats_end_date, qual_nb_points)
                              (default: False)
    :param meta_list: dict of metadata (tsuid is the key)

    :type tsuid_list: list
    :type start: int
    :type end: int or None
    :type nb_points: int or None
    :type generate_metadata: boolean
    :type meta_list: dict

    :return: list of dict {"tsuid": tsuid, "funcId": func_id}
    :rtype: list of dict

    :raise ValueError: if inputs are not filled properly (see called methods description)
    """

    # List of chunks of data and associated information to parallelize with Spark
    data_to_compute = []

    # Collecting information from metadata
    for tsuid in tsuid_list:
        if tsuid not in meta_list:
            LOGGER.error("Time series %s: no metadata found in base", tsuid)
            raise ValueError("No ikats metadata available for cutting %s" % tsuid)
        if 'ikats_start_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_start_date' for time series %s not found in base", tsuid)
            raise ValueError("No start date available for cutting [%s]" % tsuid)
        if 'ikats_end_date' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'ikats_end_date' for time series %s not found in base", tsuid)
            raise ValueError("No end date available for cutting [%s]" % tsuid)
        if 'qual_ref_period' not in meta_list[tsuid]:
            # Metadata not found
            LOGGER.error("Metadata 'qual_ref_period' for time series %s not found in base", tsuid)
            raise ValueError("No reference period available for cutting [%s]" % tsuid)

        # Original time series information retrieved from metadata
        sd = int(meta_list[tsuid]['ikats_start_date'])
        ed = int(meta_list[tsuid]['ikats_end_date'])
        ref_period = int(float(meta_list[tsuid]['qual_ref_period']))

        # Get the functional identifier of the original time series
        fid_origin = IkatsApi.ts.fid(tsuid)

        # Generate functional id for resulting time series
        func_id = "%s_cut_%d" % (fid_origin, time.time() * 1e6)

        # Creating new reference in database for new time series
        IkatsApi.ts.create_ref(func_id)

        # Prepare data to compute by defining intervals of final size nb_points_by_chunk
        # Chunk intervals computation:

        data_chunk_size = int(nb_points_by_chunk * ref_period)

        # Computing intervals for chunk definition
        interval_limits = np.hstack(np.arange(sd, ed, data_chunk_size, dtype=np.int64))

        # from intervals we define chunk of data to compute:
        #
        # 1. defining chunks excluding last point of data within every chunk
        # ex: intervals = [ 10, 20, 30, 40 ] => 3 chunks [10, 19], [20, 29] and [30, 39] (last chunk added in step 2)
        data_to_compute.extend([(tsuid,
                                 func_id,
                                 i,
                                 interval_limits[i],
                                 interval_limits[i + 1] - 1) for i in range(len(interval_limits) - 1)])
        # 2. adding last interval, including last point of data
        # ex: [30, 40]
        data_to_compute.append((tsuid,
                                func_id,
                                len(interval_limits) - 1,
                                interval_limits[-1],
                                ed + 1))

    LOGGER.info("Running dataset cut using Spark")
    # Create or get a spark Context
    spark_context = ScManager.get()

    try:

        # OUTPUT: [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        inputs = spark_context.parallelize(data_to_compute, len(data_to_compute))

        # INPUT:  [(TSUID_origin, func_id, chunk_index, sd_interval, ed_interval), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # PROCESS: read original data in database / filter chunk with no data
        rdd_data = inputs \
            .map(lambda x: ((x[0], x[1]), x[2], IkatsApi.ts.read(tsuid_list=x[0], sd=int(x[3]), ed=int(x[4]))[0])) \
            .filter(lambda x: len(x[2]) > 0)

        # INPUT:  [((TSUID_origin, func_id), chunk_index, original_data_array), ...]
        # OUTPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
        # PROCESS: cut chunks of data, filter empty results
        rdd_cut_chunk_data = rdd_data \
            .map(lambda x: (x[0], x[1], _spark_cut(data=x[2], min_date=start, max_date=end))) \
            .filter(lambda x: len(x[2][1]) > 0) \
            .cache()

        # no end cutting date provided => case of cutting a given number of points
        if end is None:

            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), [(chunk_index1, nb_points1), (chunk_index2, nb_points2),...], ...]
            # PROCESS: Collect nb points associated to chunk indexes
            ts_pts_by_chunk = rdd_cut_chunk_data.map(lambda x: (x[0], (x[1], x[2][0]))) \
                .groupByKey().map(lambda x: (x[0], list(x[1]))) \
                .collect()

            # Compute for each ts from collected data:
            #   - last chunk index containing points to keep
            #   - the number of points to keep in this last chunk
            # cut_info: {(TSUID_origin1, func_id1):(last_chunk_index1, nb_points1),
            #             (TSUID_origin2, func_id2):(last_chunk_index2, nb_points2), ...}
            cut_info = {}
            for ts in ts_pts_by_chunk:
                nb_cumul = 0
                for chunk_index, points in ts[1]:
                    nb_cumul += points
                    # noinspection PyTypeChecker
                    if nb_cumul > nb_points:
                        # noinspection PyTypeChecker
                        cut_info[ts[0]] = (chunk_index, points - (nb_cumul - nb_points))
                        break
                else:
                    LOGGER.warning(
                        "Number of points to cut from the provided start date exceeds the size of time series %s",
                        IkatsApi.ts.fid(ts[0][0]))
                    # case nb_points > nb points of the time series
                    # noinspection PyTypeChecker
                    cut_info[ts[0]] = (chunk_index, points)

            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.filter(lambda x: x[1] <= cut_info[x[0]][0]) \
                .map(lambda x: (x[0], x[2][1][:cut_info[x[0]][1]] if x[1] == cut_info[x[0]][0] else x[2][1]))

        else:
            # INPUT: [((TSUID_origin, func_id), chunk_index, (nb_points, data_cut_array)), ...]
            # OUTPUT: [((TSUID_origin, func_id), data_cut_array), ...]
            rdd_cut_data = rdd_cut_chunk_data.map(lambda x: (x[0], x[2][1]))

        # INPUT:  [((TSUID_origin, func_id), data_cut_array), ...]
        # OUTPUT: [(TSUID_origin, func_id, TSUID, sd, ed), ...]
        # PROCESS: create cut data in database / compute global start and end date
        identifiers = rdd_cut_data \
            .map(lambda x: (x[0][0], x[0][1], _spark_import(fid=x[0][1],
                                                            data=x[1],
                                                            generate_metadata=generate_metadata))) \
            .map(lambda x: ((x[0], x[1], x[2][0]), (x[2][1], x[2][2]))) \
            .reduceByKey(lambda x, y: (min(x[0], y[0]), max(x[1], y[1]))) \
            .map(lambda x: (x[0][0], x[0][1], x[0][2], x[1][0], x[1][1])) \
            .collect()

    except Exception as err:
        msg = "Exception raised while cutting with Spark: %s " % err
        LOGGER.error(msg)
        raise IkatsException(msg)

    finally:
        # Stop spark Context
        ScManager.stop()  # Post-processing: metadata import and return dict building

    # Returns list of dict containing the results of the cut time series: TSUID and functional identifiers
    results = []
    for timeseries in identifiers:
        tsuid_origin = timeseries[0]
        func_id = timeseries[1]
        tsuid = timeseries[2]
        sd = timeseries[3]
        ed = timeseries[4]

        # Import metadata in non temporal database
        _save_metadata(tsuid=tsuid, md_name='ikats_start_date', md_value=sd, data_type=DTYPE.date, force_update=True)
        _save_metadata(tsuid=tsuid, md_name='ikats_end_date', md_value=ed, data_type=DTYPE.date, force_update=True)

        # Retrieve imported number of points from database
        qual_nb_points = IkatsApi.ts.nb_points(tsuid=tsuid)
        IkatsApi.md.create(tsuid=tsuid, name='qual_nb_points', value=qual_nb_points, data_type=DTYPE.number,
                           force_update=True)

        # Inherit from parent
        IkatsApi.ts.inherit(tsuid, tsuid_origin)

        # Fill returned list
        results.append({"tsuid": tsuid, "funcId": func_id})

    return results
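A worked illustration of the chunk intervals built above, using toy values chosen only for the illustration:

import numpy as np

sd, ed = 10, 45            # toy start/end dates
ref_period = 1             # toy reference period
nb_points_by_chunk = 10

data_chunk_size = int(nb_points_by_chunk * ref_period)                           # 10
interval_limits = np.hstack(np.arange(sd, ed, data_chunk_size, dtype=np.int64))  # [10 20 30 40]

chunks = [(i, int(interval_limits[i]), int(interval_limits[i + 1]) - 1)
          for i in range(len(interval_limits) - 1)]
chunks.append((len(interval_limits) - 1, int(interval_limits[-1]), ed + 1))
print(chunks)  # [(0, 10, 19), (1, 20, 29), (2, 30, 39), (3, 40, 46)]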
Example #20
def cut_y(original_ts_list, criterion, fid_pattern="{fid}_cutY{compl}", chunk_size=75000):
    """
    Algorithm Cut-Y

    Cut a list of timeseries along the Y-axis (values) according to a criterion defined as a python expression.
    Matching and non-matching values are separated into 2 timeseries

    This algorithm uses spark

    From the TS list provided (used as reference), extract 2 TS lists:
    * The first one matching the value condition
    * The second one not matching the value condition

    :param original_ts_list: List of TSUID/funcID to use for filtering: [{tsuid:xxx, funcId:xxx}, ...]
    :param criterion: python expression used to define a matching pattern
    :param fid_pattern: pattern used to name the FID of the output TSUID.
           {fid} will be replaced by the FID of the original TSUID
           {M} will be replaced by the original TSUID metric name
           {compl} will be replaced by "" or "_compl" depending on the output type (matching/not matching).
    :param chunk_size: the number of points per chunk

    :type original_ts_list: list
    :type criterion: str
    :type fid_pattern: str
    :type chunk_size: int

    :return: 2 lists representing the "matching" and "not matching" list of TS corresponding to the input
    :rtype: list

    :raises ValueError: if ts_list is badly formatted
    :raises TypeError: if ts_list is not a list
    """

    # Check input validity
    if type(original_ts_list) is not list:
        raise TypeError("ts_list shall be a list")
    if len(original_ts_list) == 0:
        raise ValueError("ts_list shall have at least one element")
    for _, item in enumerate(original_ts_list):
        if "tsuid" not in item or "funcId" not in item:
            raise ValueError("ts_list shall have tsuid and funcId defined")

    # Get all the metadata
    md_list = IkatsApi.md.read(ts_list=[x['tsuid'] for x in original_ts_list])

    # Prepare the spark items to parallelize

    # Create and build the data that will be used in spark transformations
    ts_list_with_new_fid, fid2tsuid = _prepare_spark_data(fid_pattern=fid_pattern,
                                                          md_list=md_list,
                                                          ts_list=original_ts_list)
    # Chunks computation
    ts_info = []
    for ts_data in ts_list_with_new_fid:

        # Get the chunks raw information
        chunks = SparkUtils.get_chunks(tsuid=ts_data[0], md_list=md_list, chunk_size=chunk_size)

        # Build a new list containing only used information
        for chunk in chunks:
            ts_info.append({
                "tsuid": ts_data[0],
                "start_date": chunk[1],
                "end_date": chunk[2],
                "matching_fid": ts_data[1],
                "not_matching_fid": ts_data[2],
                "matching_tsuid": fid2tsuid[ts_data[1]],
                "not_matching_tsuid": fid2tsuid[ts_data[2]]
            })

    # Get Spark Context
    # Important: use only this method in Ikats to get a spark context
    spark_context = ScManager.get()
    try:

        # Prepare the lambda expression. Value is replaced by "Y" variable name
        lambda_criterion = eval("lambda Y : " + criterion)

        # OUTPUT : [{
        #   tsuid:x,
        #   start_date:x,
        #   end_date:x,
        #   matching_fid:x,
        #   not_matching_fid:x,
        #   matching_tsuid:x,
        #   not_matching_tsuid:x
        # }, ...]
        # PROCESS : Parallelize TS chunks information
        rdd_ts_list = spark_context.parallelize(ts_info, max(8, len(ts_info)))

        # INPUT :  [{
        #   tsuid:x,
        #   start_date:x,
        #   end_date:x,
        #   matching_fid:x,
        #   not_matching_fid:x,
        #   matching_tsuid:x,
        #   not_matching_tsuid:x
        # }, ...]
        # OUTPUT : [({
        #  start_date: "date of the first point matching the criterion in the current chunk"
        #  end_date: "date of the last point matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points matching the criterion in the current chunk"
        #  tsuid: "TSUID of the matching part"
        # },
        # {
        #  start_date: "date of the first point not matching the criterion in the current chunk"
        #  end_date: "date of the last point not matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points not matching the criterion in the current chunk"
        #  tsuid: "TSUID of the non-matching part"
        # }), ...]
        # PROCESS : Separate points matching and not-matching the criterion in every chunk. Fill the corresponding TS
        rdd_imported = rdd_ts_list.map(lambda x: _spark_cut_y_chunk(
            tsuid=x['tsuid'],
            start_date=x['start_date'],
            end_date=x['end_date'],
            match_criterion=lambda_criterion,
            result_info={
                "matching_fid": x['matching_fid'],
                "not_matching_fid": x['not_matching_fid'],
                "matching_tsuid": x['matching_tsuid'],
                "not_matching_tsuid": x['not_matching_tsuid']
            }))

        # INPUT : [({
        #  start_date: "date of the first point matching the criterion in the current chunk"
        #  end_date: "date of the last point matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points matching the criterion in the current chunk"
        #  tsuid: "TSUID of the matching part"
        # },
        # {
        #  start_date: "date of the first point not matching the criterion in the current chunk"
        #  end_date: "date of the last point not matching the criterion in the current chunk"
        #  numberOfSuccess: "number of points not matching the criterion in the current chunk"
        #  tsuid: "TSUID of the non-matching part"
        # }), ...]
        # OUTPUT : [(TSUID, nb_points, start_date, end_date), ...]
        # PROCESS : Flatten the results and simplify the format to allow quick actions on every item
        rdd_metadata_prep = rdd_imported \
            .flatMap(lambda x: x) \
            .filter(lambda x: x is not None) \
            .map(lambda x: (x['tsuid'], x['numberOfSuccess'], x['start_date'], x['end_date']))

        # This RDD is reused by several actions below. Caching it before the first
        # action avoids recomputing the whole lineage each time
        rdd_metadata_prep.cache()

        # Delete the output TS that received no points
        deleted_tsuid = rdd_metadata_prep \
            .map(lambda x: (x[0], x[1])) \
            .reduceByKey(lambda x, y: x + y) \
            .filter(lambda x: x[1] == 0) \
            .map(lambda x: (x[0], IkatsApi.ts.delete(tsuid=x[0]))) \
            .map(lambda x: x[0]) \
            .collect()

        # Create metadata qual_nb_points
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[1])) \
            .reduceByKey(lambda x, y: x + y) \
            .filter(lambda x: x[1] > 0) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="qual_nb_points", value=x[1]))

        # Create metadata ikats_start_date
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[2])) \
            .filter(lambda x: x[1] is not None) \
            .reduceByKey(lambda x, y: min(x, y)) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_start_date", value=x[1]))

        # Create metadata ikats_end_date
        rdd_metadata_prep \
            .map(lambda x: (x[0], x[3])) \
            .filter(lambda x: x[1] is not None) \
            .reduceByKey(lambda x, y: max(x, y)) \
            .foreach(lambda x: IkatsApi.md.create(tsuid=x[0], name="ikats_end_date", value=x[1]))

        # Unpersist the RDD since it is no longer needed
        rdd_metadata_prep.unpersist()

    finally:
        ScManager.stop()

    # Inherit properties
    for item in ts_list_with_new_fid:
        if fid2tsuid[item[1]] not in deleted_tsuid:
            IkatsApi.ts.inherit(tsuid=fid2tsuid[item[1]], parent=item[0])
        if fid2tsuid[item[2]] not in deleted_tsuid:
            IkatsApi.ts.inherit(tsuid=fid2tsuid[item[2]], parent=item[0])

    # Format and sort the results
    # First output contains the matched data points TS reference
    # Second output contains the not matched (complement) points TS reference
    return (_format_output(deleted_tsuid=deleted_tsuid,
                           fid2tsuid=fid2tsuid,
                           ts_list_with_new_fid=ts_list_with_new_fid,
                           index=1),
            _format_output(deleted_tsuid=deleted_tsuid,
                           fid2tsuid=fid2tsuid,
                           ts_list_with_new_fid=ts_list_with_new_fid,
                           index=2))
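A hedged usage sketch for the filter above (added for illustration). The function's name is not visible in this snippet, so `cut_ts_by_value` below is a hypothetical alias; only the parameters documented in the docstring come from the source, and the TS reference is a placeholder.

# cut_ts_by_value = <hypothetical alias for the function defined above>
#
# matching, not_matching = cut_ts_by_value(
#     original_ts_list=[{"tsuid": "tsuid_A", "funcId": "fid_A"}],  # placeholder TS reference
#     criterion="Y > 0",                                           # keep points whose value is > 0
#     fid_pattern="{fid}{compl}",                                  # outputs named fid_A and fid_A_compl
#     chunk_size=75000)
# # 'matching' references the TS built from the points satisfying the criterion,
# # 'not_matching' references the complementary TS.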
Example #21
0
    def _apply_motif_global_coll_ex1(self, activate_spark):
        """
        Test:
          - with the global method to search the motif neighborhood,
          - with/without Spark according to activate_spark,
          - exploring similarities with the collisions heuristic,
          - with input words differing by only one letter, so that every sequence
            Si has collisions with every Sj in that matrix.

         Note: results ought to be equal to test_global_brute_no_spark_ex1
        """

        # Build the SAX result where the words have only one different letter (words: 5 letters)
        sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"]
        tested_sax_word = ''.join(sequences)
        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=tested_sax_word)
        sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(5)

        # Build a collision matrix (the real collision matrix is different, but we take this one for the test)
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [30, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 50, 0, 0],
                      [6, 10, 20, 60, 0]]))

        self._print_matrix("test_global_coll_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances:
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Using neighborhood_method=OPT_USING_COLLISIONS
        #
        #  for collisions (0,1) (1,2) (2,3) (3,4) greater than min_value==25
        #  and with the collisions heuristic: only sequences having collisions with Si or Sj are examined
        #
        # for radius 1.9  => global result is [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]
        #
        # for radius 2.5  => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
        #                                      => reduced to [[[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #
        # for radius 3.5  => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #                                      => reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #
        # for radius 6    => global result is [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]
        #                                      => reduced to [[0, 1, 2, 3, 4]]
        #
        for radius, expected_res in [
                [2.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]],
                [1.9, [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]],
                [3.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]],
                [6, [[0, 1, 2, 3, 4]]]]:

            # Build the class for motif search where the min_value is 25
            search_info = NeighborhoodSearch(size_sequence=20,
                                             mindist_lookup_table=breakpoint,
                                             alphabet_size=5,
                                             sax=np.transpose(sax),
                                             radius=radius,
                                             collision_matrix=collision_matrix)

            # For info, print the mindist matrix
            # (see _print_mindist_mat doc to activate the print)
            self._print_mindist_mat(search_info)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=25,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            print("radius {}:expected:                 {}".format(
                radius, expected_res))
            result = search_info.motif_neighborhood_global(
                recognition_info.min_value, recognition_info)

            print("radius {}:->global with collisions: {}".format(
                radius, result))

            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertTrue(group in expected_res)
Example #22
0
    def run(self, tsuids):
        """
        Run the Spark Distance calculation

        Creates the RDD from the tsuid list,
        loads the TS from tdm into a broadcast dictionary (i.e. shared by all workers),
        maps the RDD with a cartesian product (i.e. with 2 RDDs, get RDD1,RDD1 RDD1,RDD2 RDD2,RDD1 RDD2,RDD2)
        to build the comparison couples,
        then reduces by applying the distance function and adds the result into an Accumulator
        (i.e. shared by all workers).
        The distance function takes the two TS from the broadcast dictionary, truncates the longest one
        and applies the Euclidean distance.

        Usage: run([tsuid1, tsuid2, ..., tsuidn])

        example : tsuids = ['0000110000030003F30000040003F1',
                            '0000110000030003F40000040003F1',
                            '0000110000030003F50000040003F1',
                            '0000110000030003F60000040003F1',
                            '0000110000030003F70000040003F1']

        :param tsuids: a list of tsuids (str)
        :type tsuids: list

        """

        # creation of the RDD
        rdd = self.spark_context.parallelize(tsuids)

        self.logger.info("rdd parallelized")
        self.logger.info("loading TS")
        start_time = time.time()

        j = len(tsuids) // self.ts_load_split_size
        self.logger.debug(type(tsuids))
        self.logger.info("Number of TS: %i ", len(tsuids))
        ts = list()
        for i in range(0, j + 1):
            k = (i + 1) * self.ts_load_split_size
            if k > len(tsuids):
                k = len(tsuids)
            self.logger.info("extract TS from index %i to %i ",
                             i * self.ts_load_split_size, k)
            ts.extend(self.tdm.get_ts(tsuids[i * self.ts_load_split_size:k]))

        ts_dic = dict()
        self.logger.info("Number of TS loaded : %i ", len(ts))
        for index in range(0, len(tsuids)):
            ts_dic[tsuids[index]] = ts[index]
        # broadcast var used to get the map result
        broadcast_var = self.spark_context.broadcast(ts_dic)
        loading_end_time = time.time()
        self.logger.info("Loading Time : %s ", loading_end_time - start_time)

        # create the result accumulator
        list_accum = self.spark_context.accumulator(dict(),
                                                    ListAccumulatorParam())

        def calculate_distance(tsuid_list):
            """
               :param tsuid_list: a pair of tsuids
               :type tsuid_list: list
            """
            # use py4j logger to avoid Serialization problems.
            logger = logging.getLogger('py4j')
            logger.setLevel(logging.INFO)
            logger.removeHandler(logger.handlers[0])
            # sh = logging.StreamHandler(sys.stdout)
            stream_handler = logging.StreamHandler()
            stream_handler.setLevel(logging.INFO)
            formatter = logging.Formatter(
                '%(asctime)s:%(levelname)s:%(funcName)s:%(message)s')
            stream_handler.setFormatter(formatter)
            logger.addHandler(stream_handler)

            # start distance calculus.
            logger.debug("tsuid1= %s", tsuid_list[0])
            logger.debug("tsuid2= %s", tsuid_list[1])
            if tsuid_list[0] != tsuid_list[1]:
                first_ts = np.array(broadcast_var.value[tsuid_list[0]][:, 1])
                second_ts = np.array(broadcast_var.value[tsuid_list[1]][:, 1])
                calculus_len = min(len(first_ts), len(second_ts))

                distance = euclidean(first_ts[0:calculus_len],
                                     second_ts[0:calculus_len])

                # logger.debug("tsuid list %s and distance %f" % (tsuid_list, distance))
                list_accum.add({tsuid_list: distance})

        __import__('ikats.algo.core.distance')
        rdd.cartesian(rdd).foreach(calculate_distance)

        ScManager.stop()

        computation_end_time = time.time()
        self.logger.info("Loading Time : %s ", loading_end_time - start_time)
        self.logger.info("Compute Time : %s ",
                         computation_end_time - loading_end_time)
        return list_accum.value
Example #23
0
    def _apply_iter_coll_no_spark_ex1(self, activate_spark):
        """
         Tests motif_neighborhood_iterative():
         - the iterative method
         - using the heuristic based upon collisions
         - to search the motif neighborhood

         Note: test where the words differ by only one letter.
        """

        # Build the SAX result where the words have only one different letter (words: 5 letters)
        sequences = ["abcde", "abcdd", "abcdc", "abcdb", "abcda"]
        tested_sax_word = ''.join(sequences)
        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=tested_sax_word)
        sax, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        breakpoint = sax_result.build_mindist_lookup_table(5)

        # Build a collision matrix
        # Note: this matrix is different from the one used in
        #   test_iterative__brute_no_spark_ex1:
        #   => zeros are added: coll(3,2) == coll(4,2) == 0
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [40, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 0, 0, 0],
                      [6, 10, 0, 50, 0]]))

        self._print_matrix("test_iterative__brute_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances:
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Using neighborhood_method=OPT_USING_COLLISIONS
        #
        # iterative:  examining collisions (i,j) per iteration:
        #             (3,4) then (1,2) +(0,1)
        #
        #             (collisions greater than min_value==25)
        #
        # Test with fixed radius 1.9:
        #    - iter=1    => result is [[3, 4]] considering (S3,S4) neighborhood
        #    - iter=2    => result extended with [0,1,2] considering (S0,S1), unchanged for (S1,S2)
        #    - iter=3    => result is the same as for iter=2: no more collisions available
        #    - iter=100  => result is the same as for iter=2: no more collisions available
        #
        for radius, nb_iter, expected_res in [[1.9, 1, [[3, 4]]],
                                              [1.9, 2, [[3, 4], [0, 1, 2]]],
                                              [1.9, 3, [[3, 4], [0, 1, 2]]],
                                              [1.9, 100, [[3, 4], [0, 1, 2]]]]:

            # Build the class for motif search where the min_value is 25
            search_info = NeighborhoodSearch(size_sequence=20,
                                             mindist_lookup_table=breakpoint,
                                             alphabet_size=5,
                                             sax=np.transpose(sax),
                                             radius=radius,
                                             collision_matrix=collision_matrix)

            # For info, print the mindist matrix
            # (see _print_mindist_mat doc to activate the print)
            self._print_mindist_mat(search_info)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=nb_iter,
                min_value=25,
                is_algo_method_global=False,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            result = search_info.motif_neighborhood_iterative(
                recognition_info.min_value, recognition_info)

            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertTrue(group in expected_res)
Example #24
0
def main_test():
    """
    Functional test entry point
    """

    logger = logging.getLogger("ikats.algo.core.correlation")
    # Log format
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s:%(levelname)s:%(funcName)s:%(message)s')
    # Create another handler that will redirect log entries to STDOUT
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    if os.getenv("PYSPARK_PYTHON") is None:
        os.putenv("PYSPARK_PYTHON",
                  "/home/ikats/tools/ikats_processing/bin/python")
    if os.getenv("SPARK_HOME") is None:
        os.putenv("SPARK_HOME", "/opt/spark")

    print('Loading Spark Context')
    # Get a spark Context
    ScManager.get()

    tdm = TemporalDataMgr()

    answer = 'n'
    tsuid_list = []
    ds_name = ''
    while answer.lower() != 'y':
        ds_name = input('\nEnter dataset Name: ')
        tsuid_list = tdm.get_data_set(ds_name)['ts_list']

        print("%s TS found in dataset %s" % (len(tsuid_list), ds_name))

        if len(tsuid_list) > 0:
            answer = input(
                "Run the correlation matrix on this dataset? [Y/n] ")

    print('Running correlation matrix on %s TS' % len(tsuid_list))

    start_time = time.time()
    sp_corr = SparkCorrelation(tdm)
    sp_corr.force_parallel_get_ts = True
    sp_corr.run(tsuid_list)

    print(
        "EXECUTION TIME (for %d TS with %d pts/ea = %d points): %.3f seconds" %
        (len(tsuid_list), sp_corr.ts_len_ref,
         (len(tsuid_list) * sp_corr.ts_len_ref), (time.time() - start_time)))

    if os.path.isfile('/tmp/spark_correlation_result_%s.csv' % ds_name):
        os.remove('/tmp/spark_correlation_result_%s.csv' % ds_name)
    with open('/tmp/spark_correlation_result_%s.csv' % ds_name,
              'w',
              newline='') as opened_file:
        opened_file.write(sp_corr.get_csv())

    print("Matrix in CSV format is saved at the following location:")
    print("   /tmp/spark_correlation_result_%s.csv" % ds_name)
    print("You can check the content by doing :")
    print("   cat /tmp/spark_correlation_result_%s.csv" % ds_name)
    print("   less /tmp/spark_correlation_result_%s.csv" % ds_name)
    print("   vi /tmp/spark_correlation_result_%s.csv" % ds_name)
Example #25
0
def random_projections(ts_list, sax_info, collision_info, recognition_info):
    """
    The Random Projections Algorithm
    ================================

    This algorithm does the following (detailed for 1 TS but valid for many TS):
        * Apply the sliding window
        * Normalize the TS (global or/and local)
        * Filter the linear sequences (optional) and trivial matches
        * Apply the SAX algorithm
        * Build the collision matrix
        * Find the largest value cells in the collision matrix
        * Search the motif neighborhood

        .. note::
            The algorithm can produce "paa values" (numeric) for each sequence, but the resulting
            output can be very large.

    **Catalogue implementation is provided**: main_random_projections() is calling random_projections() once all
    configurations ConfigSAX, ConfigCollision, ConfigRecognition are initialized.

    :param ts_list: list of TSUID
    :type ts_list: list

    :param sax_info: the information to make the sliding window and the sax_algorithm
    :type sax_info: ConfigSax

    :param collision_info: the information to build the collision matrix
    :type collision_info: ConfigCollision

    :param recognition_info: the information to perform the pattern recognition
    :type recognition_info: ConfigRecognition

    :return: dict with the patterns found ('patterns'), the breakpoints ('break_points')
             and the alphabet used ('disc_break_points')
    :rtype: dict
    """
    LOGGER.info("Configurations deduced from user parameters:")
    LOGGER.info("- sliding sax nb paa=%s", sax_info.paa)
    LOGGER.info("- sliding sax alphabet size=%s", sax_info.alphabet_size)
    LOGGER.info("- sliding sax sequences_size=%s", sax_info.sequences_size)
    LOGGER.info("- collision nb indexes=%s", collision_info.index)
    LOGGER.info("- collision nb iterations=%s", collision_info.nb_iterations)
    LOGGER.info("- collision accepted errors=%s", collision_info.errors)
    LOGGER.info("- recognition min_value=%s", recognition_info.min_value)
    LOGGER.info("- recognition iterations=%s", recognition_info.iterations)
    LOGGER.info("- recognition similarity radius=%s", recognition_info.radius)

    # Create or get a spark Context
    LOGGER.info("Running using Spark")
    spark_ctx = ScManager.get()

    # INPUT : all the TS { "ts_name" : [[time1, value1],...], "ts_name2": ... }
    # OUTPUT :  rdd_sequences_list = [ (key, sequence), ... ]
    # rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
    # PROCESS : *sliding_windows* create sequences for each TS (results are RDDs)
    rdd_sequences_list, rdd_normalization_coefficients = sliding_windows(ts_list=ts_list,
                                                                         sax_info=sax_info,
                                                                         spark_ctx=spark_ctx,
                                                                         trivial_radius=recognition_info.radius / 2)
    # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
    # OUTPUT : rdd_sax_result is a SaxResult object containing
    #  * paa (rdd of flatMap) : rdd of large list of all the paa_values concatenated
    #  * breakpoints (list) : list of the breakpoints (len = sax_info.alphabet_size - 1)
    #  * sax_word (large str): large string of all the SAX words concatenated
    # PROCESS : Give the SAX form of the sequences
    rdd_sax_result = run_sax_on_sequences(rdd_sequences_data=rdd_sequences_list,
                                          paa=sax_info.paa,
                                          alphabet_size=sax_info.alphabet_size)

    # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
    # OUTPUT : sequences_list = { key: sequence, ...} NOT AN RDD!
    # PROCESS : transform rdd_sequences_list elements into dict
    sequences_list = rdd_sequences_list.collectAsMap()

    # INPUT : rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
    # OUTPUT : sequences_list = { key: (un-normalized seq_mean, un-normalized seq_sd), ...} NOT AN RDD!
    # PROCESS : transform rdd_normalization_coefficients elements into dict
    normalization_coefficients = rdd_normalization_coefficients.collectAsMap()

    # Keep only necessary information of each sequence
    sequences_list = sequences_info(sequences_list, normalization_coefficients)

    # *paa_sequence* is a "conversion" of *sax* from letters to numbers (matrix with same shape)
    # (useful for post-processing the random projections algorithm).
    breakpoints = [str(i) for i in rdd_sax_result.breakpoints]

    # Build the table which gives the distance between two letters (needs just sax_result.breakpoints)
    mindist_lookup_table = rdd_sax_result.build_mindist_lookup_table(sax_info.alphabet_size)

    # Give the SAX result in an array (needs rdd_sax_result.sax_word and sax_result.paa)
    rdd_sax, paa_result, number_of_sequences = rdd_sax_result.start_sax(sax_info.paa, spark_ctx=spark_ctx)

    LOGGER.info("- filtered number of words=%s", number_of_sequences)

    if number_of_sequences == 1:
        LOGGER.info("- sliding window found just one sequence, no collision matrix computed.")
        collision_matrix = SparseMatrix(np.array([[0]]))
    else:

        # Build the collision matrix; the number of iterations can change
        # (e.g. if the length of a sequence is too small, nb_iterations can be lower than specified)
        collision_matrix, collision_info.nb_iterations = final_collision_matrix(
            sax=rdd_sax,
            number_of_iterations=collision_info.nb_iterations,
            index_selected=collision_info.index,
            word_len=sax_info.paa,
            spark_ctx=spark_ctx)

    # *collision_matrix* is a sparse matrix : light in memory

    # Give the result of the Equation 9
    eq9_result = equation9(number_of_sequences=number_of_sequences,
                           size_alphabet=sax_info.alphabet_size,
                           size_word=sax_info.paa,
                           errors=collision_info.errors,
                           index_selected=collision_info.index,
                           iterations=collision_info.nb_iterations)

    sax = rdd_sax.collect()
    paa_result = np.transpose(paa_result)

    distance_info = NeighborhoodSearch(size_sequence=sax_info.sequences_size,
                                       mindist_lookup_table=mindist_lookup_table,
                                       alphabet_size=sax_info.alphabet_size,
                                       sax=sax,
                                       radius=recognition_info.radius,
                                       collision_matrix=collision_matrix)

    LOGGER.info("- theoretical Eq9 limit: min collisions = %s for accepted errors=%s", eq9_result,
                collision_info.errors)

    # Check the eq9_result with min_value
    if eq9_result < recognition_info.min_value:
        LOGGER.warning("- setting Eq9 limit to min_value=%s: because Eq9 < min_value", recognition_info.min_value)
        eq9_result = recognition_info.min_value
    if eq9_result < 1:
        LOGGER.warning("- setting Eq9 limit to 1: because Eq9 < 1")
        eq9_result = 1

    # find the motif neighborhood by using the largest value cells in the collision matrix
    if recognition_info.is_algo_method_global is True:
        algo_result = distance_info.motif_neighborhood_global(eq9_result, recognition_info)
    else:
        algo_result = distance_info.motif_neighborhood_iterative(eq9_result, recognition_info)

    # Give the results with the names of sequences and not their number in the collision matrix
    algo_result = result_on_sequences_form(algo_result, sequences_list, sax, sax_info.alphabet_size, paa_result)

    algo_result = result_on_pattern_form(algo_result)

    # Give the alphabet used in the SAX algorithm
    alphabet = start_alphabet(sax_info.alphabet_size)

    result = {'patterns': algo_result,
              'break_points': breakpoints,
              'disc_break_points': alphabet}

    if spark_ctx is not None:
        ScManager.stop()
        LOGGER.info("Ended Spark session.")

    return result
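A hedged usage sketch of random_projections() (added for illustration). ConfigSax and ConfigRecognition are instantiated as in the surrounding examples; the ConfigCollision keyword arguments are an assumption (only its attributes index, nb_iterations and errors appear in this snippet), so the whole sketch is kept commented out.

# sax_info = ConfigSax(paa=10, sequences_size=20, with_mean=True, with_std=True,
#                      global_norm=False, local_norm=False, linear_filter=False,
#                      recovery=0.5, coefficients=[0.1, 0.9], alphabet_size=5)
# collision_info = ConfigCollision(index=2, nb_iterations=10, errors=2)  # assumed constructor, to be checked
# recognition_info = ConfigRecognition(is_stopped_by_eq9=True, iterations=10,
#                                      min_value=25, is_algo_method_global=True,
#                                      activate_spark=True, radius=1.5,
#                                      neighborhood_method=OPT_USING_COLLISIONS)
# result = random_projections(ts_list=["tsuid_1", "tsuid_2"],  # placeholder TSUIDs
#                             sax_info=sax_info,
#                             collision_info=collision_info,
#                             recognition_info=recognition_info)
# # result is a dict with keys 'patterns', 'break_points' and 'disc_break_points'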
Example #26
0
def calc_quality_stats(ts_list,
                       compute_value=True,
                       compute_time=True,
                       chunk_size=75000,
                       force_save=True):
    """
    Compute the quality statistics

    Returns a dict as follows:
        {
            "TSUIDx" : {
                "MetadataX": ValueX,
                ...
            },
            ...
        }

    Don't override the default chunk_size unless you know what you are doing.
    It defines the number of points in a single chunk (assuming the TS is periodic).
    Use it only for performance purposes.

    :param ts_list: List of TSUID to work onto
    :type ts_list: list

    :param compute_value: boolean indicating to compute metadata related to value
    :type compute_value: bool

    :param compute_time: boolean indicating to compute metadata related to time
    :type compute_time: bool

    :param chunk_size: (Advanced usage) Override the chunk size
    :type chunk_size: int

    :param force_save: Save metadata even if already present (default True)
    :type force_save: bool

    :return: Tuple composed of the input ts list and a dict
             having TSUID as key and a sub-dict as value,
             where the sub-dict keys are metadata names
    :rtype: tuple
    """

    if not compute_value and not compute_time:
        LOGGER.error("You shall compute at least one set of metadata.")
        raise ValueError("You shall compute at least one set of metadata")

    try:
        # Convert tsuid_list [{tsuid:x, fid:x},...] to tsuid_list [tsuid,...]
        tsuid_list = [x['tsuid'] for x in ts_list]

    except TypeError:
        # Already a tsuid_list. No change
        tsuid_list = ts_list

    LOGGER.info('Computing Quality stats for %s TS', len(tsuid_list))

    # Get all metadata
    md_list = IkatsApi.md.read(ts_list=tsuid_list)

    # Initialize results
    results = {}
    for tsuid in tsuid_list:
        results[tsuid] = {}

    try:
        # Get Spark Context
        # Important: always use this method in IKATS to obtain a Spark context
        spark_context = ScManager.get()

        results = {}
        for index, tsuid in enumerate(tsuid_list):

            LOGGER.info('Processing Quality stats for TS %s (%s/%s)', tsuid,
                        index + 1, len(tsuid_list))

            # Generating information about TSUID chunks
            # ([chunk_index, sd, ed], ...)
            ts_info = []
            for chunk_index in range(
                    _ts_chunk_count(tsuid=tsuid,
                                    md_list=md_list,
                                    chunk_size=chunk_size)):
                ts_info.append(
                    _ts_chunk(tsuid=tsuid,
                              index=chunk_index,
                              md_list=md_list,
                              chunk_size=chunk_size))

            # Parallelizing information to work with spark
            # Each chunk can be computed separately, so divided into len(chunks) partitions
            rdd_ts_info = spark_context.parallelize(ts_info,
                                                    max(8, len(ts_info)))

            # RDD containing the data points for every chunk of a TSUID:
            # ([chunk_index, [[timestamp, value], ...]], ...)
            rdd_ts_dps = rdd_ts_info \
                .map(lambda x: (x[0], _ts_read(tsuid=tsuid, start_date=x[1], end_date=x[2])))

            # This RDD is used multiple times, caching it to speed up
            rdd_ts_dps.cache()

            if compute_value:
                # Compute metadata related to "value" information
                result = calc_qual_stats_value(tsuid,
                                               rdd_ts_dps,
                                               force_save=force_save)
                # Append to final results
                if tsuid in results:
                    results[tsuid].update(result[tsuid])
                else:
                    results.update(result)

            if compute_time:
                # Compute metadata related to "time" information
                result = calc_qual_stats_time(tsuid,
                                              rdd_ts_dps,
                                              force_save=force_save)
                # Append to final results
                if tsuid in results:
                    results[tsuid].update(result[tsuid])
                else:
                    results.update(result)

            # We don't need the cache anymore
            rdd_ts_dps.unpersist()
    except Exception as cause:
        raise IkatsException("Quality stats failure with ...", cause)
    finally:
        ScManager.stop()
    return ts_list, results
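A hedged usage sketch of calc_quality_stats() (added for illustration): the TS reference is a placeholder and a running IKATS backend is required, so the sketch is only defined, not called.

def _example_calc_quality_stats():
    # Placeholder TS reference: replace with a real {tsuid, funcId} pair
    ts_list = [{"tsuid": "tsuid_A", "funcId": "fid_A"}]
    # Compute both "value" and "time" related metadata with the default chunk size
    ts_list, stats = calc_quality_stats(ts_list=ts_list,
                                        compute_value=True,
                                        compute_time=True)
    # stats maps each TSUID to a sub-dict {metadata_name: value}
    return stats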
Example #27
0
def unwrap_ts_list(ts_list,
                   unit=TSUnit.Radians,
                   discontinuity=None,
                   fid_pattern="%(fid)s__unwrap",
                   use_spark=True):
    """
    Unwrap the phase of each TS in the list by changing deltas between values
    to their 2*discontinuity complement.

    :param ts_list: list of TSUID to unwrap
    :param unit: TS unit : "Degrees" or "Radians" (default)
    :param discontinuity: Maximum discontinuity between values.
    :param fid_pattern: Pattern of the new FID ('%(fid)s' will be replaced by original FID)
    :param use_spark: Set to True to use spark. True is default

    :type ts_list: list
    :type unit: str or TSUnit
    :type discontinuity: float or None
    :type fid_pattern: str
    :type use_spark: bool

    :return: a new ts_list
    :rtype: list

    :raises TypeError: if input is not well formatted
    """

    if not isinstance(ts_list, list) or len(ts_list) == 0:
        raise TypeError("ts_list shall be a list having at least one TS")

    if discontinuity is None:
        raise ValueError("Discontinuity is not filled")

    results = []
    if use_spark:
        # Get Spark Context
        spark_context = ScManager.get()

        try:

            # Parallelize 1 TS = 1 partition
            rdd_ts_list = spark_context.parallelize(ts_list, len(ts_list))

            rdd_results = rdd_ts_list.map(
                lambda x: unwrap_tsuid(tsuid=x["tsuid"],
                                       fid=x["funcId"],
                                       fid_pattern=fid_pattern,
                                       discontinuity=discontinuity,
                                       unit=unit))

            # Persist the results to avoid recomputing them
            # (the functional identifier reservation is called multiple times through IkatsApi.ts.create_ref)
            rdd_results.cache()

            timings = rdd_results.map(lambda x: x[1]).reduce(
                lambda x, y: x + y)

            results = rdd_results.map(lambda x: x[0]).collect()

            rdd_results.unpersist()

            LOGGER.debug("Unwrapping %s TS using Spark: %s", len(ts_list),
                         timings.stats())
        finally:
            # Stop the context
            ScManager.stop()
    else:
        timings = Timings()
        for item in ts_list:
            tsuid = item["tsuid"]
            fid = item["funcId"]
            result, tsuid_timings = unwrap_tsuid(tsuid=tsuid,
                                                 fid=fid,
                                                 fid_pattern=fid_pattern,
                                                 discontinuity=discontinuity,
                                                 unit=unit)
            results.append(result)
            timings += tsuid_timings

        LOGGER.debug("Unwrapping %s TS: %s", len(ts_list), timings.stats())
    return results
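A hedged usage sketch of unwrap_ts_list() (added for illustration): the TS reference is a placeholder and the discontinuity value (pi, for a phase in radians) is only an example.

def _example_unwrap_ts_list():
    import math

    # Placeholder TS reference: replace with a real {tsuid, funcId} pair
    ts_list = [{"tsuid": "tsuid_A", "funcId": "fid_A"}]
    # Unwrap a phase signal expressed in radians; discontinuity is mandatory
    return unwrap_ts_list(ts_list=ts_list,
                          unit=TSUnit.Radians,
                          discontinuity=math.pi,
                          fid_pattern="%(fid)s__unwrap",
                          use_spark=True)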
Example #28
0
def spark_ccf(tdm,
              tsuid_list_or_dataset,
              lag_max=None,
              tsuids_out=False,
              cut_ts=False):
    """
    This function calculates the matrix of the maxima of the cross correlation function (CCF) between all TS
    in **tsuid_list_or_dataset** IN A DISTRIBUTED MODE (using Spark)

    Cross correlation is a correlation between two timeseries, one of which is shifted by successive lag
    values. The result of the CCF is a timeseries (correlation as a function of the lag between the timeseries).
    This function keeps the maximum value of the generated CCF and puts it into the matrix for the
    corresponding timeseries couple.

    :returns: a string matrix (whose size is equal to the number of tsuids in tsuid_list_or_dataset
              plus one line and one column for headers)
    :rtype: ndarray

    :param tdm: Temporal Data Manager client
    :param tsuid_list_or_dataset: list of identifiers of the time series or dataset name
    :param lag_max: maximum lag between timeseries (cf. _ccf function for more details)
    :param tsuids_out: True to fill headers with tsuids
                       False to fill headers with functional ids
    :param cut_ts: Cut the TS list to the min-length if set to True

    :type tdm: TemporalDataMgr
    :type tsuid_list_or_dataset: list of str or str
    :type lag_max: positive int
    :type tsuids_out: boolean
    :type cut_ts: bool

    :raises TypeError: if tdm is not a TemporalDataMgr
    :raises TypeError: if tsuid_list_or_dataset is not a list nor a string
    :raises TypeError: if tsuids_out is not a boolean
    """
    if type(tdm) is not TemporalDataMgr:
        raise TypeError("tdm must be a TemporalDataMgr")

    if type(tsuid_list_or_dataset) is not list and type(
            tsuid_list_or_dataset) is not str:
        raise TypeError(
            "tsuid_list_or_dataset must be a list of string OR a string")

    if type(tsuids_out) is not bool:
        raise TypeError("tsuids_out must be a boolean")

    if type(cut_ts) is not bool:
        raise TypeError("cut_ts must be a boolean")

    if type(tsuid_list_or_dataset) is list:
        # input is a list of tsuid
        tsuid_list = tsuid_list_or_dataset
    else:
        # input is a dataset name
        dataset = tdm.get_data_set(tsuid_list_or_dataset)
        tsuid_list = dataset['ts_list']

    if tsuids_out:
        ts_list = tsuid_list
    else:
        ts_list = __retrieve_func_id(tdm, tsuid_list)

    md_list = tdm.get_meta_data(tsuid_list)

    # initialize size of time series
    min_ts_size = md_list[tsuid_list[0]]['qual_nb_points']

    if cut_ts:
        for ts in tsuid_list:
            min_ts_size = min(min_ts_size, md_list[ts]['qual_nb_points'])
    else:
        # check time series have same length
        for ts in tsuid_list:
            size_ts = md_list[ts]['qual_nb_points']
            if size_ts != min_ts_size:
                raise ValueError('time series do not have same length')

    # Create or get a spark Context
    sc = ScManager.get()

    # Build the RDD with TSUIDS
    rdd = sc.parallelize(tsuid_list)

    # Create a broadcast for spark jobs
    broadcast = sc.broadcast({
        "host": tdm.host,
        "port": tdm.port,
        "size_of_ts": min_ts_size,
        "lag_max": lag_max
    })

    # Create an accumulator to store the results of the spark workers
    accumulator = sc.accumulator(dict(), ListAccumulatorParam())

    def run_ccf_spark(working_tsuids):
        """
        Method called by spark job
        :param working_tsuids: rdd item
        :type working_tsuids: tuple
        """

        # cross correlation is equal to 1 if timeseries are the same
        if working_tsuids[0] == working_tsuids[1]:
            result = 1
        else:
            spark_tdm = TemporalDataMgr(host=broadcast.value['host'],
                                        port=broadcast.value['port'])

            result = __run_max_ccf_ts_list(tdm=spark_tdm,
                                           tsuids=list(working_tsuids),
                                           size=int(
                                               broadcast.value['size_of_ts']),
                                           lag_max=broadcast.value['lag_max'])

        accumulator.add({";".join(list(working_tsuids)): result})

    # Get TS content and perform the CCF calculation using Spark distribution to increase performance.
    # Each element of the RDD is a couple of timeseries:
    # the list of couples is first sorted, then duplicates are removed to avoid doing the same
    # calculation twice, e.g. for (a,b) and (b,a)
    rdd.cartesian(rdd).map(
        lambda x: tuple(sorted(list(x)))).distinct().foreach(run_ccf_spark)

    # Retrieving result from accumulator to fill matrix result
    ts_nb = len(tsuid_list)
    matrix_corr = np.zeros((ts_nb, ts_nb))
    for str_couple in accumulator.value:
        couple = str_couple.split(';')
        matrix_corr[
            tsuid_list.index(couple[0]),
            tsuid_list.index(couple[1])] = accumulator.value[str_couple]
        matrix_corr[
            tsuid_list.index(couple[1]),
            tsuid_list.index(couple[0])] = accumulator.value[str_couple]

    # fill final matrix with headers
    matrix = __fill_headers_to_final_matrix(matrix_corr, ts_list)

    return matrix
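A hedged usage sketch of spark_ccf() (added for illustration): the dataset name is a placeholder and a reachable Temporal Data Manager is required.

def _example_spark_ccf():
    # Temporal Data Manager client (default connection settings)
    tdm = TemporalDataMgr()
    # Max-CCF matrix on a whole dataset, cutting all TS to the shortest length
    return spark_ccf(tdm=tdm,
                     tsuid_list_or_dataset="my_dataset",  # placeholder dataset name
                     lag_max=10,
                     tsuids_out=False,
                     cut_ts=True)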
Example #29
0
def correlation_ts_list_loop(ts_list,
                             corr_method,
                             context_meta,
                             variable_meta='metric',
                             config=ConfigCorrelationLoop(
                                 the_num_partitions=24,
                                 the_point_cache_size=50e6,
                                 the_digits_number=4)):
    """
    Computes the correlations between timeseries selected by observed variables and contexts.

    The observed contexts are defined by the context_meta argument.
    The variables are defined by variable_meta argument.

    Assumed:
      - Each context has a list of distinct variables.
      - Each timeseries is uniquely associated to one context and one variable.

    Example with Airbus data:
      - the *context* is a flight in an Airbus dataset of timeseries.
      - the *variables* could be metric 'WS1', metric 'WS2' etc.

    This algorithm is spark-distributed on the cluster.

    Spark summary
    *************

      - **step 1** The driver prepares a set of configured tuples: each tuple is configured for one context,
               and has a list of (variable, timeseries reference). Timeseries references are tsuids.

      - **step 2** An RDD is initialized from the set of **'configured tuples'** cells

      - **step 3** A new RDD is computed from step 2: each cell **'configured tuple'** is transformed into a list of
        **'correlation inputs'** cells: each cell is prepared to be processed by the correlation method, for a
        subpart of the correlation matrix computed for one context

        At this step, each task executes: *_spark_combine_pairs()*

      - **step 4** A new RDD is computed as a set of **'correlation result'** cells from the **'correlation inputs'**
        cells: each task will read timeseries pairs and compute the correlation result with the selected method
        (Pearson, ...)

        At this step, each task executes: *_spark_correlate_pairs()*

      - **step 5**: aggregates the **'correlation result'** cells by variable pairs into an RDD of
        **'aggregated correlations'** cells. Each task will

        1. create and save low-level results CorrelationsByContext into the IKATS database, as JSON content.

          .. seealso:: the JSON is described in the
            ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict()

        2. return **'aggregated correlation'** cells providing

          - pair of variable indexes
          - aggregated values: Mean, Variance
          - saved reference of CorrelationsByContext

        At this step, each task executes: *_spark_build_corrs_by_context()*

      - **step 6**: the driver collects the RDD of **'aggregated correlations'**, and computes the high-level result,
        which is a CorrelationDataset.

        Finally the JSON generated by CorrelationDataset is returned.

    :param ts_list: selected timeseries list on which are computed the correlations
    :type ts_list: list
    :param corr_method: the method computing the correlation between 2 timeseries.

      The value must be in CORRELATION_METHODS.

      Choose PEARSON to apply the pearson correlation.
    :type corr_method: str
    :param context_meta: name of the metadata identifying each observed context,
      where correlations are computed.

      .. note:: this metadata shall exist for each timeseries, otherwise the
        latter will be ignored.

      With Airbus example: 'FlightIdentifier' identifies the flight as observed context.

    :type context_meta: str
    :param variable_meta: Optional, with default value 'metric',
      the name of the metadata identifying the variables.

      .. note:: this metadata shall exist for each timeseries, otherwise the
        latter will be ignored.

      The metadata values are sorted into a list providing the effective indexes of the correlation
      matrix: the N-th index is reserved for the timeseries having the N-th value of
      this metadata in alphanumeric order.

      It is advised to keep the default value: this advanced argument must provide distinct indexes for each
      timeseries under same observed context.

    :type variable_meta: str
    :return: JSON-friendly dict grouping

      - Matrix of means of correlations (see step5)

      - Matrix of variances of correlations (see step5)

      - Matrix of references to the JSON content of CorrelationByContext (see step 5)

      .. seealso:: detailed JSON structure in
        ikats.algo.correlation.data.CorrelationDataset::get_json_friendly_dict()

    :rtype: dict as json-friendly structure for json library
    :raises IkatsException: if an error occurs while processing the correlations.
    """

    sc = None

    try:
        LOGGER.info("Starting correlation loop ...")
        LOGGER.info(" - observed contexts based on: %s", context_meta)
        LOGGER.info(" - variables ordered by: %s", variable_meta)

        # Check parameters
        corr_func = CORRELATION_FUNCTIONS.get(corr_method, None)
        if corr_func is None:
            msg = "Unknown correlation method from CORRELATION_FUNCTIONS: corr_method={}"
            raise IkatsException(msg.format(corr_method))

        if type(ts_list) is not list:
            msg = "Unexpected type: list expected for ts_list={}"
            raise IkatsException(msg.format(ts_list))

        if type(context_meta) is not str or len(context_meta) == 0:
            msg = "Unexpected arg value: defined str is expected for context_meta={}"
            raise IkatsException(msg.format(context_meta))
        if type(variable_meta) is not str or len(variable_meta) == 0:
            msg = "Unexpected arg value: defined str is expected for variable_meta={}"
            raise IkatsException(msg.format(variable_meta))

        # Hyp: the metadata part can be loaded from the driver

        ts_metadata_dict = IkatsApi.md.read(ts_list)

        # Note: the algorithm discards the variables X without Corr(X,Y) for Y different from X
        #       but when X is retained, the final result will present the Corr(X,X) beside the Corr(X,Y)
        corr_loop_config, sorted_contexts, sorted_variables = _initialize_config_from_meta(
            ts_metadata_dict,
            context_meta=context_meta,
            variable_meta=variable_meta)

        LOGGER.info("- sorted_contexts=%s", sorted_contexts)
        LOGGER.info("- sorted_variables=%s", sorted_variables)

        nb_contexts = len(sorted_contexts)

        if nb_contexts * len(sorted_variables) == 0:
            # Algo simply return empty result when there is no variable or no context consistent
            #
            # - case 1: case when there is no computable Corr(X, Y)
            #           where variables X and Y are different for the same context
            # - case 2: missing metadata for context_name => no context
            # - case 3: missing metadata for ordering_meta => no variable
            #
            LOGGER.warning("Empty result from selection=%s", ts_list)
            obj_empty_result = CorrelationDataset()
            obj_empty_result.set_contexts(contexts=sorted_contexts,
                                          meta_identifier=context_meta)
            obj_empty_result.set_variables(labels=sorted_variables)
            obj_empty_result.add_matrix(matrix=[],
                                        desc_label="Empty Mean correlation")
            obj_empty_result.add_matrix(
                matrix=[], desc_label="Empty Variance correlation")
            obj_empty_result.add_rid_matrix(matrix=[])

            return obj_empty_result.get_json_friendly_dict()

        # Computes the number of matrix chunks
        # (one matrix chunk will be handled by one task)
        # -------------------------------------
        if nb_contexts < config.num_partitions:
            # Case when there are fewer contexts than recommended partitions:
            # - the computing of one matrix is split into several chunks
            nb_matrix_blocks = ceil(float(config.num_partitions) / nb_contexts)
        else:
            nb_matrix_blocks = 1

        LOGGER.info("- number of matrix blocks by context=%s",
                    nb_matrix_blocks)

        # Computes the timeseries LRU cache size used by one task
        # -------------------------------------------------------
        # 1/ retrieve nb points for each TS, default value is assumed to be 1e6 in order to be robust
        # in case 'qual_nb_points' is not available, (should not happen ...)
        defined_nb_points = [
            int(v.get('qual_nb_points', 1e6))
            for v in ts_metadata_dict.values()
        ]
        # 2/ evaluate the number of points handled by one task carrying one matrix chunk
        total_nb_points_by_ctx = sum(
            defined_nb_points) / nb_contexts / nb_matrix_blocks
        if config.the_point_cache_size >= total_nb_points_by_ctx:
            # the best condition:
            # the system will memorize in the cache every TS loaded for the same matrix
            ts_cache_size = len(sorted_variables)
        else:
            # the case when it is required to limit the number of TS memorized in the cache
            # for the same row of the correlation matrix
            # Note: len(sorted_variables) == max size of a correlation row == matrix dimension
            ts_cache_size = config.the_point_cache_size / total_nb_points_by_ctx * len(
                sorted_variables)
            ts_cache_size = ceil(max(2.0, ts_cache_size))
        LOGGER.info("- ts_cache_size=%s", ts_cache_size)

        # release ts_metadata_dict from memory
        ts_metadata_dict = None

        sc = ScManager.get()

        # Spark_step_1: initialize the RDD
        # ------------
        # OUTPUT: RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] )

        rdd_initial_config = sc.parallelize(corr_loop_config,
                                            config.num_partitions)

        # Spark_step_2: combine the pairs of timeseries by contexts and by chunks
        # ------------
        # INPUT:  RDD of ( <context index>, [ (<var index 1> , <tsuid 1>), ..., (<var index N> , <tsuid N>) ] )
        # OUTPUT: RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] )
        #
        #    where <pair X_Y> is ((<var X index>, <tsuid X> ), (<var Y index>, <tsuid Y>))
        #
        # PROCESS: computes the cartesian product and splits the list of pairs into smaller-sized lists
        #
        rdd_var_combinations = rdd_initial_config.flatMap(
            lambda x: _spark_combine_pairs(context=x[0],
                                           variables=x[1],
                                           nb_corr_matrix_blocks=
                                           nb_matrix_blocks))

        if nb_matrix_blocks > 1:
            # reshuffles all the data over the cluster ...
            rdd_var_combinations = rdd_var_combinations.repartition(
                nb_contexts * nb_matrix_blocks)

        # Spark_step_3: computes the correlations
        # ------------
        # INPUT:  RDD of ( <context_index>, [ <pair 1_2>, <pair 1_3>, ..., <pair M_N> ] )
        # OUTPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> )
        #
        #  where
        #    <computed corr X_Y> is (<context>, (<tsuid X>, <tsuid Y>), correlation)
        #
        # PROCESS: computes the correlations on the timeseries associated to the variables
        #
        rdd_correlations = rdd_var_combinations.flatMap(
            lambda x: _spark_correlate_pairs(context=x[0],
                                             var_pairs=x[1],
                                             corr_method=corr_method,
                                             ts_cache_size=ts_cache_size))

        # generates the parent_id:
        #   presently this identifier may be used by Postgres admin,
        #   to group the low-level results attached to the same high-level result
        #   => at the moment a label including a timestamp is generated
        obj_result = CorrelationDataset()
        parent_id = obj_result.get_id()

        def r_append(data, computed_corr):
            """
            Append computed correlation to data
            :param data:
            :param computed_corr:
            :return:
            """
            data.append(computed_corr)
            return data

        def r_merge(one, two):
            """
            Merge two to one
            :param one:
            :param two:
            :return:
            """
            one.extend(two)
            return one

        # Spark_step_4: aggregate the correlations by pair of variables
        # ------------
        # INPUT: RDD of ( (<var X index>, <var Y index>), <computed corr X_Y> ) as described previously
        #
        # OUTPUT: RDD of ( (<var X index>, <var Y index>), list of tuples:
        #                                  (<context index>, (tsuid_X, tsuid_Y), <correlation result> )
        #                )
        # PROCESS: aggregates by key=(<var X index>, <var Y index>) the correlation information profiles,
        #          enhanced with tsuid pairs
        #
        rdd_agg_correlations = rdd_correlations.aggregateByKey(
            zeroValue=[], seqFunc=r_append, combFunc=r_merge)

        # Spark_step_5:
        # ------------
        # INPUT: RDD of  ( (<var X index>, <var Y index>), list of tuples:
        #                                  (<context index>, (tsuid_X, tsuid_Y), <correlation result> )
        #                )
        #
        # OUTPUT: RDD of ( ( <var X index>, <var Y index>), <low-level Result ID>, <Mean correlation>, <Var correlation>
        #                )
        # PROCESS: - creates and saves aggregated low-level results as CorrelationsByContext
        #          - computes Mean and Variance of low-level results
        #          - returns summarized info: Mean+Variance+ result ID
        rdd_results_corr_by_context = \
            rdd_agg_correlations.map(lambda x: (_spark_build_corrs_by_context(variables=x[0],
                                                                              agg_ctx_ts_corr=x[1],
                                                                              desc_context=context_meta,
                                                                              sorted_variables=sorted_variables,
                                                                              sorted_contexts=sorted_contexts,
                                                                              corr_method=corr_method,
                                                                              parent_id=parent_id,
                                                                              ndigits=config.the_digits_number)))

        # Spark_step_6:
        # ------------
        #
        # 6.1: collects
        #
        # INPUT: RDD of  ( (<var X index>, <var Y index>), <low-level Result ID>, <Mean(corr)>, <Var(corr)>
        #                )
        #
        # OUTPUT: collected list
        #
        # PROCESS:  collects high-level results
        #
        collected_results_corr = rdd_results_corr_by_context.collect()

        # 6.2: prepare the result
        #
        #  - Encodes the returned json-friendly content from the collected high-level results
        #  - returns the result
        #
        matrix_mean = get_triangular_matrix(dim=len(sorted_variables),
                                            default_value_diag=1.0,
                                            default_value_other=None)

        matrix_variance = get_triangular_matrix(dim=len(sorted_variables),
                                                default_value_diag=0.0,
                                                default_value_other=None)

        matrix_id = get_triangular_matrix(dim=len(sorted_variables),
                                          default_value_diag=None,
                                          default_value_other=None)

        for var_index_pair, data_oid, mean, variance in collected_results_corr:
            var_index_row = var_index_pair[0]
            var_index_col = var_index_pair[1]
            # required: recompute the column offset of the cell within its row
            # triangular matrix => cell(i, j) is stored at position j - i of the row triangular_matrix[i]
            col_offset = var_index_col - var_index_row
            matrix_mean[var_index_row][col_offset] = mean
            matrix_variance[var_index_row][col_offset] = variance
            matrix_id[var_index_row][col_offset] = data_oid
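        # For illustration (hypothetical mean values): with dim=3 the triangular storage is
        #   matrix_mean == [[m_00, m_01, m_02], [m_11, m_12], [m_22]]
        # so the mean of the variable pair (0, 2) is read at matrix_mean[0][2 - 0]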

        obj_result.set_contexts(contexts=sorted_contexts,
                                meta_identifier=context_meta)

        obj_result.set_variables(sorted_variables)
        obj_result.add_matrix(matrix=matrix_mean,
                              desc_label="Mean Correlation")
        obj_result.add_matrix(matrix=matrix_variance, desc_label="Variance")
        obj_result.add_rid_matrix(matrix_id)

        LOGGER.info("... ended correlation loop.")
        return obj_result.get_json_friendly_dict()

    except Exception as error:
        LOGGER.error("... ended correlation loop with error.")
        raise IkatsException("Failed execution: correlation_ts_loop()") from error
    finally:
        if sc:
            ScManager.stop()
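

# A minimal sketch (an assumption for illustration, not the actual ikats helper) of how a
# triangular matrix like the ones returned by get_triangular_matrix() could be built:
# row i only stores the cells (i, j) with j >= i, so it holds dim - i values and
# cell (i, j) is found at offset j - i of that row.
def get_triangular_matrix_sketch(dim, default_value_diag, default_value_other):
    """Build a list of shrinking rows: row i has dim - i cells, diagonal cell first."""
    return [[default_value_diag if offset == 0 else default_value_other
             for offset in range(dim - i)]
            for i in range(dim)]

# Usage example: get_triangular_matrix_sketch(3, 1.0, None)
# => [[1.0, None, None], [1.0, None], [1.0]]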
예제 #30
0
    def test_sliding_window_recovery(self):
        """
        Testing the recovery parameter.
        """
        sax_info = ConfigSax(paa=3,
                             sequences_size=6,
                             with_mean=True,
                             with_std=True,
                             global_norm=False,
                             local_norm=False,
                             linear_filter=False,
                             recovery=0.5,
                             coefficients=[1, 1],
                             alphabet_size=6)
        ts_name = ["linear_time_serie"]
        spark_ctx = ScManager.get()
        # Test with recovery = 0.5
        result, _ = sliding_windows(ts_list=ts_name,
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)

        result = result.collect()
        # 2 sequences in the timeseries => 3 sequences at the end
        self.assertEqual(len(result), 3)
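        # Presumably (based on the mocked 12-point timeseries): sequences of size 6
        # with recovery=0.5 overlap by half a window (starts at indexes 0, 3 and 6),
        # hence the 3 extracted sequences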

        # Test with MAX recovery
        # recovery = 1 (the maximum: 100% <=> the next window starts one point to the right)
        sax_info.recovery = 1.0
        result, _ = sliding_windows(ts_list=ts_name,
                                    sax_info=sax_info,
                                    spark_ctx=spark_ctx)
        result = result.collect()

        # remember that the 'sliding_windows' function calls 'get_ts_mock(ts_name)[0]'
        ts = get_ts_mock(ts_name)[0]
        ts_val_0 = list(ts[0:6][:, 1])
        ts_val_1 = list(ts[6:12][:, 1])
        timestamp_0 = list(ts[0:6][:, 0])
        timestamp_1 = list(ts[6:12][:, 0])

        # Check the timestamps and the values of the extracted sequences
        # result[i] = (key, np.array([[timestamp, value], ...]))

        # check ts values: every value of every extracted sequence
        # must come from the original timeseries
        condition = all(np.all(np.isin(result[i][1][:, 1], ts_val_0 + ts_val_1))
                        for i in range(len(result)))
        self.assertTrue(condition)

        # check timestamps: every timestamp of every extracted sequence
        # must come from the original timeseries
        condition = all(np.all(np.isin(result[i][1][:, 0], timestamp_0 + timestamp_1))
                        for i in range(len(result)))
        self.assertTrue(condition)

        # Test with MINIMUM recovery
        # recovery close to 0 (almost no overlap between consecutive windows)
        sax_info.recovery = 0.01
        result2, _ = sliding_windows(ts_list=ts_name,
                                     sax_info=sax_info,
                                     spark_ctx=spark_ctx)
        result2 = result2.collect()
        # the timeseries holds exactly 2 non-overlapping sequences => 2 sequences expected
        self.assertEqual(len(result2), 2)