Exemplo n.º 1
0
    def test_sparse_matrix_get_column(self):
        """
        Checks SparseMatrix.get_column() on a lower-triangular input.

        The symmetric collision matrix below is never given to the code; it is
        shown only to explain the expected values::

            [[0, 1, 0, 0, 4],
             [1, 0, 5, 6, 7],
             [0, 5, 0, 8, 9],
             [0, 6, 8, 0, 0],
             [4, 7, 9, 0, 0]]

        The test feeds the equivalent triangular form (per the original
        author's note, this is the form implemented in random_proj).
        """
        lower_triangle = np.array([
            [0, 0, 0, 0, 0],
            [1, 0, 0, 0, 0],
            [0, 5, 0, 0, 0],
            [0, 6, 8, 0, 0],
            [4, 7, 9, 0, 0],
        ])

        sparse = SparseMatrix(lower_triangle)

        # For each column of the *symmetric* matrix: the row indexes holding
        # a non-zero value, which is what get_column() is expected to return.
        expected_rows_by_column = (
            (0, [1, 4]),        # column [0, 1, 0, 0, 4]
            (1, [0, 2, 3, 4]),  # column [1, 0, 5, 6, 7]
            (2, [1, 3, 4]),     # column [0, 5, 0, 8, 9]
            (3, [1, 2]),        # column [0, 6, 8, 0, 0]
            (4, [0, 1, 2]),     # column [4, 7, 9, 0, 0]
        )
        for column, expected_rows in expected_rows_by_column:
            self.assertListEqual(sparse.get_column(column), expected_rows)
Exemplo n.º 2
0
    def _apply_motif_iter_zero_coll(self, activate_spark):
        """
        Test
         - with the iterative method to search the neighborhood motif,
         - with/without spark jobs
         - and where the words are all different => no collisions

        :param activate_spark: forwarded to ConfigRecognition.activate_spark
        """
        # Both values are 5 here, but they are distinct concepts: the lookup
        # table is sized by the alphabet, not by the word length nor by the
        # number of sequences.
        alphabet_size = 5
        word_length = 5

        spark_context = ScManager.get()
        # Build the SAX result with different words, and small breakpoints
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-0.3, -0.1, 0.1, 0.3],
                               sax_word='abcdebcdeacdeabdeabceabcd')
        sax, _, nb_seq = sax_result.start_sax(word_length,
                                              spark_ctx=spark_context)
        # sax is an rdd -> to np.array
        sax = np.transpose(sax.collect())

        # The original passed nb_seq here, which only worked because the
        # number of sequences happens to equal the alphabet size.
        mindist_lookup_table = sax_result.build_mindist_lookup_table(
            alphabet_size)

        # Different words => only zero cells in the collision matrix
        collision_matrix = SparseMatrix(np.zeros((nb_seq, nb_seq)))

        # Build the class for motif search
        search_info = NeighborhoodSearch(
            size_sequence=20,
            mindist_lookup_table=mindist_lookup_table,
            alphabet_size=alphabet_size,
            sax=np.transpose(sax),
            radius=1000,
            collision_matrix=collision_matrix)

        recognition_info = ConfigRecognition(
            is_stopped_by_eq9=True,
            iterations=100,
            min_value=1,
            is_algo_method_global=False,
            activate_spark=activate_spark,
            radius=1000,
            neighborhood_method=OPT_USING_BRUTE_FORCE)

        # Same expectation for both neighborhood methods: with an empty
        # collision matrix no group of similar sequences is returned.
        for method in (OPT_USING_BRUTE_FORCE, OPT_USING_COLLISIONS):
            recognition_info.neighborhood_method = method
            result = search_info.motif_neighborhood_iterative(
                30, recognition_info)

            # There is no similar sequences
            self.assertEqual(len(result), 0)
Exemplo n.º 3
0
    def test_sparse_matrix_init(self):
        """
        Tests the SparseMatrix init

        Checks that every non-zero cell of the triangular input is stored in
        ``SparseMatrix.data`` as ``(value, (row, column))``.
        """
        # Symmetric collision matrix, shown for understanding only (not used):
        #                            [[0, 1, 2, 3, 4],
        #                             [1, 0, 5, 6, 7],
        #                             [2, 5, 0, 8, 9],
        #                             [3, 6, 8, 0, 10],
        #                             [4, 7, 9, 10, 0]]
        #
        # Equivalent content as a triangular matrix, actually used
        # (per the original author's note: the form implemented in random_proj)
        original_mat = np.array([[0, 0, 0, 0, 0], [1, 0, 0, 0, 0],
                                 [2, 5, 0, 0, 0], [3, 6, 8, 0, 0],
                                 [4, 7, 9, 10, 0]])

        sparse_from_triangular = SparseMatrix(original_mat)

        # One entry per non-zero cell below the diagonal
        self.assertEqual(len(sparse_from_triangular.data), 4 + 3 + 2 + 1)

        # assertEqual replaces the assertEquals alias, which was deprecated
        # and is no longer available in Python 3.12+
        for coll, (ind_a, ind_b) in sparse_from_triangular.data:
            self.assertEqual(coll, original_mat[ind_a, ind_b])
Exemplo n.º 4
0
def random_projections(ts_list, sax_info, collision_info, recognition_info):
    """
    The Random Projections Algorithm
    ================================

    This algorithm does the following (detailed for 1 TS but valid for many TS):
        * Apply the sliding window
        * Normalize the TS (global or/and local)
        * Filter the linear sequences (optional) and trivial matches
        * Apply the SAX algorithm
        * Build the collision matrix
        * Find the largest value cells in the collision matrix
        * Search the motif neighborhood

        .. note::
            The algorithm can produce "paa values" (numeric) for each sequence. The problem is the huge length of the
            results.

    **Catalogue implementation is provided**: main_random_projections() is calling random_projections() once all
    configurations ConfigSAX, ConfigCollision, ConfigRecognition are initialized.

    Side effects:
        * ``collision_info.nb_iterations`` is overwritten with the number of iterations actually performed
          when the collision matrix is built.
        * The Spark session obtained from ScManager is stopped before leaving the function, including when
          an exception is raised during the processing.

    :param ts_list: list of TSUID
    :type ts_list: list

    :param sax_info: the information to make the sliding window and the sax_algorithm
    :type sax_info: ConfigSax

    :param collision_info: the information to build the collision matrix
    :type collision_info: ConfigCollision

    :param recognition_info: the information to make the pattern recognition
    :type recognition_info: ConfigRecognition

    :return: a dict with the keys:
        * 'patterns': the similar sequences, in pattern form
        * 'break_points': the SAX breakpoints, each converted to str
        * 'disc_break_points': the alphabet used by the SAX algorithm
    :rtype: dict
    """
    LOGGER.info("Configurations deduced from user parameters:")
    LOGGER.info("- sliding sax nb paa=%s", sax_info.paa)
    LOGGER.info("- sliding sax alphabet size=%s", sax_info.alphabet_size)
    LOGGER.info("- sliding sax sequences_size=%s", sax_info.sequences_size)
    LOGGER.info("- collision nb indexes=%s", collision_info.index)
    LOGGER.info("- collision nb iterations=%s", collision_info.nb_iterations)
    LOGGER.info("- collision accepted errors=%s", collision_info.errors)
    LOGGER.info("- recognition min_value=%s", recognition_info.min_value)
    LOGGER.info("- recognition iterations=%s", recognition_info.iterations)
    LOGGER.info("- recognition similarity radius=%s", recognition_info.radius)

    # Create or get a spark Context
    LOGGER.info("Running using Spark")
    spark_ctx = ScManager.get()

    # Everything using the Spark context is guarded so that the session is released even on failure
    try:
        # INPUT : all the TS { "ts_name" : [[time1, value1],...], "ts_name2": ... }
        # OUTPUT :  rdd_sequences_list = [ (key, sequence), ... ]
        # rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
        # PROCESS : *sliding_windows* create sequences for each TS (results are RDDs)
        rdd_sequences_list, rdd_normalization_coefficients = sliding_windows(
            ts_list=ts_list,
            sax_info=sax_info,
            spark_ctx=spark_ctx,
            trivial_radius=recognition_info.radius / 2)

        # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
        # OUTPUT : rdd_sax_result is a SaxResult object containing
        #  * paa (rdd of flatMap) : rdd of large list of all the paa_values concatenated
        #  * breakpoints (list) : list of the breakpoints (len = sax_info.alphabet_size - 1)
        #  * sax_word (large str): large string of all the SAX words concatenated
        # PROCESS : Give the SAX form of the sequences
        rdd_sax_result = run_sax_on_sequences(rdd_sequences_data=rdd_sequences_list,
                                              paa=sax_info.paa,
                                              alphabet_size=sax_info.alphabet_size)

        # INPUT : rdd_sequences_list = [ (key, sequence), ... ]
        # OUTPUT : sequences_list = { key: sequence, ...} NOT AN RDD!
        # PROCESS : transform rdd_sequences_list elements into dict
        sequences_list = rdd_sequences_list.collectAsMap()

        # INPUT : rdd_normalization_coefficients = [ (same_key,(un-normalized seq_mean, un-normalized seq_sd)), ...]
        # OUTPUT : normalization_coefficients = { key: (un-normalized seq_mean, un-normalized seq_sd), ...}
        #          NOT AN RDD!
        # PROCESS : transform rdd_normalization_coefficients elements into dict
        normalization_coefficients = rdd_normalization_coefficients.collectAsMap()

        # Keep only necessary information of each sequence
        sequences_list = sequences_info(sequences_list, normalization_coefficients)

        # The breakpoints are exported as strings in the final result
        breakpoints = [str(i) for i in rdd_sax_result.breakpoints]

        # Build the table which give the distance between two letters (need just sax_result.breakpoints)
        mindist_lookup_table = rdd_sax_result.build_mindist_lookup_table(sax_info.alphabet_size)

        # Give the SAX result in a array (need rdd_sax_result.sax_word and sax_result.paa)
        rdd_sax, paa_result, number_of_sequences = rdd_sax_result.start_sax(sax_info.paa, spark_ctx=spark_ctx)

        LOGGER.info("- filtered number of words=%s", number_of_sequences)

        if number_of_sequences == 1:
            LOGGER.info("- sliding window find just one sequence, no collision matrix computed.")
            collision_matrix = SparseMatrix(np.array([[0]]))
        else:
            # Build the collision matrix, the number of iteration can change
            # (if the len of a sequence is too small for example nb_iteration can be < nb_iteration specified)
            collision_matrix, collision_info.nb_iterations = final_collision_matrix(
                sax=rdd_sax,
                number_of_iterations=collision_info.nb_iterations,
                index_selected=collision_info.index,
                word_len=sax_info.paa,
                spark_ctx=spark_ctx)

        # *collision_matrix* is a sparse matrix : light in memory

        # Give the result of the Equation 9
        eq9_result = equation9(number_of_sequences=number_of_sequences,
                               size_alphabet=sax_info.alphabet_size,
                               size_word=sax_info.paa,
                               errors=collision_info.errors,
                               index_selected=collision_info.index,
                               iterations=collision_info.nb_iterations)

        sax = rdd_sax.collect()
        paa_result = np.transpose(paa_result)

        distance_info = NeighborhoodSearch(size_sequence=sax_info.sequences_size,
                                           mindist_lookup_table=mindist_lookup_table,
                                           alphabet_size=sax_info.alphabet_size,
                                           sax=sax,
                                           radius=recognition_info.radius,
                                           collision_matrix=collision_matrix)

        LOGGER.info("- theoretical Eq9 limit: min collisions = %s for accepted errors=%s", eq9_result,
                    collision_info.errors)

        # Check the eq9_result with min_value: the threshold is never lower than min_value, nor lower than 1
        if eq9_result < recognition_info.min_value:
            LOGGER.warning("- setting Eq9 limit to min_value=%s: because Eq9 < min_value", recognition_info.min_value)
            eq9_result = recognition_info.min_value
        if eq9_result < 1:
            LOGGER.warning("- setting Eq9 limit to 1: because Eq9 < 1")
            eq9_result = 1

        # find the motif neighborhood by using the largest value cells in the collision matrix
        if recognition_info.is_algo_method_global is True:
            algo_result = distance_info.motif_neighborhood_global(eq9_result, recognition_info)
        else:
            algo_result = distance_info.motif_neighborhood_iterative(eq9_result, recognition_info)

        # Give the results with the names of sequences and not their number in the collision matrix
        algo_result = result_on_sequences_form(algo_result, sequences_list, sax, sax_info.alphabet_size, paa_result)

        algo_result = result_on_pattern_form(algo_result)

        # Give the alphabet used in the SAX algorithm
        alphabet = start_alphabet(sax_info.alphabet_size)

        return {'patterns': algo_result,
                'break_points': breakpoints,
                'disc_break_points': alphabet}
    finally:
        # Runs on success and on failure: previously the session was left open when a step raised
        if spark_ctx is not None:
            ScManager.stop()
            LOGGER.info("Ended Spark session.")
Exemplo n.º 5
0
    def _apply_motif_global_same_words(self, activate_spark):
        """
        Scenario: global neighborhood search where all the SAX words are equal.

        - the global method is used to search the neighborhood motif,
        - spark jobs are used or not, according to activate_spark,
        - the same expectations are checked for both neighborhood methods
          (brute force, then collisions heuristic).

        :param activate_spark: forwarded to ConfigRecognition.activate_spark
        """
        spark_context = ScManager.get()

        # SAX result with large breakpoints; 4 times the same 5-letter word
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-300, -100, 100, 300],
                               sax_word='abcdeabcdeabcdeabcde')
        rdd_words, _, _ = sax_result.start_sax(5, spark_ctx=spark_context)
        # the rdd is collected and converted into a np.array
        words = np.transpose(rdd_words.collect())

        lookup_table = sax_result.build_mindist_lookup_table(alphabet_size=5)

        # Collision matrix: every cell below the diagonal is set to 100
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0],
                      [100, 0, 0, 0],
                      [100, 100, 0, 0],
                      [100, 100, 100, 0]]))

        # Expected mindist distances (identical words):
        #
        # [[ 0.  0.  0.  0.]
        #  [ 0.  0.  0.  0.]
        #  [ 0.  0.  0.  0.]
        #  [ 0.  0.  0.  0.]]
        for method_opt in [OPT_USING_BRUTE_FORCE, OPT_USING_COLLISIONS]:
            # Motif search object, rebuilt for each method
            search_info = NeighborhoodSearch(
                size_sequence=20,
                mindist_lookup_table=lookup_table,
                alphabet_size=5,
                sax=np.transpose(words),
                radius=0.01,
                collision_matrix=collision_matrix)

            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=1,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=0.01,
                neighborhood_method=method_opt)

            result = search_info.motif_neighborhood_global(
                30, recognition_info)

            self._print_mindist_mat(search_info)

            # The six largest cells all lead to the same neighborhood:
            # the six groups are reduced to the single group [0, 1, 2, 3]
            self.assertEqual(len(result), 1)
            self.assertEqual(result, [[0, 1, 2, 3]])
Exemplo n.º 6
0
    def _apply_iter_coll_no_spark_ex1(self, activate_spark):
        """
        Exercises motif_neighborhood_iterative()
         - with the iterative method,
         - with the heuristic based upon collisions (OPT_USING_COLLISIONS),
         - on words differing only by their last letter.

        :param activate_spark: forwarded to ConfigRecognition.activate_spark
        """
        # Five words of 5 letters: only the last letter changes
        tested_sax_word = ''.join(["abcde", "abcdd", "abcdc", "abcdb", "abcda"])

        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=tested_sax_word)
        rdd_words, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # the rdd is collected and converted into a np.array
        words = np.transpose(rdd_words.collect())

        lookup_table = sax_result.build_mindist_lookup_table(5)

        # Collision matrix of this scenario.
        # Author's note: it differs from the matrix of the test
        #   test_iterative__brute_no_spark_ex1 by added zeros:
        #   coll(3,2) == coll(4,2) == 0
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [40, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 0, 0, 0],
                      [6, 10, 0, 50, 0]]))

        self._print_matrix("test_iterative__brute_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances (as documented by the original author):
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Collisions greater than min_value == 25 are examined per iteration:
        #   (3,4) first, then (1,2) and (0,1)
        #
        # With the fixed radius 1.9:
        #   - 1 iteration    => [[3, 4]]: neighborhood of (S3,S4)
        #   - 2 iterations   => extended with [0, 1, 2]
        #   - 3 or 100       => same as 2: no more collision available
        radius = 1.9
        both_groups = [[3, 4], [0, 1, 2]]
        cases = ((1, [[3, 4]]),
                 (2, both_groups),
                 (3, both_groups),
                 (100, both_groups))

        for nb_iter, expected_res in cases:

            # Motif search object, rebuilt for each case
            search_info = NeighborhoodSearch(
                size_sequence=20,
                mindist_lookup_table=lookup_table,
                alphabet_size=5,
                sax=np.transpose(words),
                radius=radius,
                collision_matrix=collision_matrix)

            # for info: prints the mindist matrix
            #  (see _print_mindist_mat doc: in order to activate print)
            self._print_mindist_mat(search_info)

            # min_value is 25 for this scenario
            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=nb_iter,
                min_value=25,
                is_algo_method_global=False,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            result = search_info.motif_neighborhood_iterative(
                recognition_info.min_value, recognition_info)

            # Same number of groups, each one being expected (any order)
            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertIn(group, expected_res)
Exemplo n.º 7
0
    def _apply_motif_global_coll_ex1(self, activate_spark):
        """
        Exercises motif_neighborhood_global()
          - with the global method,
          - with/without spark according to activate_spark,
          - exploring similarities with the collisions heuristic,
          - on words differing only by their last letter, where every
            sequence Si has collisions with Sj in the collision matrix.

        Author's note: results ought to be equal to
        test_global_brute_no_spark_ex1

        :param activate_spark: forwarded to ConfigRecognition.activate_spark
        """
        # Five words of 5 letters: only the last letter changes
        tested_sax_word = ''.join(["abcde", "abcdd", "abcdc", "abcdb", "abcda"])

        spark_context = ScManager.get()
        sax_result = SaxResult(paa=spark_context.parallelize([]),
                               breakpoints=[-1.1, -1, 0, 1.501],
                               sax_word=tested_sax_word)
        rdd_words, _, nb_seq = sax_result.start_sax(5, spark_ctx=spark_context)
        # the rdd is collected and converted into a np.array
        words = np.transpose(rdd_words.collect())

        lookup_table = sax_result.build_mindist_lookup_table(5)

        # Hand-made collision matrix (the real one is different, but this one
        # is chosen for the test)
        collision_matrix = SparseMatrix(
            np.array([[0, 0, 0, 0, 0],
                      [30, 0, 0, 0, 0],
                      [2, 40, 0, 0, 0],
                      [4, 8, 50, 0, 0],
                      [6, 10, 20, 60, 0]]))

        self._print_matrix("test_global_coll_no_spark_ex1",
                           collision_matrix.data, nb_seq)

        # mindist distances (as documented by the original author):
        # [[ 0.     0.     3.002  5.002  5.202]
        #  [ 0.     0.     0.     2.     2.2  ]
        #  [ 3.002  0.     0.     0.     0.2  ]
        #  [ 5.002  2.     0.     0.     0.   ]
        #  [ 5.202  2.2    0.2    0.     0.   ]]

        # Collisions (0,1) (1,2) (2,3) (3,4) are greater than min_value == 25.
        # With the collisions heuristic, only the sequences having collisions
        # with Si or Sj are examined.
        #
        # radius 1.9 => [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]
        # radius 2.5 => [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4],
        #                [1, 2, 3, 4], [1, 2, 3, 4]]
        #               reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        # radius 3.5 => [[0, 1, 2, 3, 4], [0, 1, 2, 3, 4],
        #                [0, 1, 2, 3, 4], [1, 2, 3, 4]]
        #               reduced to [[0, 1, 2, 3, 4], [1, 2, 3, 4]]
        # radius 6   => four times [0, 1, 2, 3, 4]
        #               reduced to [[0, 1, 2, 3, 4]]
        cases = (
            (2.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]),
            (1.9, [[0, 1, 2], [0, 1, 2, 3, 4], [1, 2, 3, 4], [2, 3, 4]]),
            (3.5, [[0, 1, 2, 3, 4], [1, 2, 3, 4]]),
            (6, [[0, 1, 2, 3, 4]]),
        )

        for radius, expected_res in cases:

            # Motif search object, rebuilt for each radius
            search_info = NeighborhoodSearch(
                size_sequence=20,
                mindist_lookup_table=lookup_table,
                alphabet_size=5,
                sax=np.transpose(words),
                radius=radius,
                collision_matrix=collision_matrix)

            # for info: prints the mindist matrix
            #  (see _print_mindist_mat doc: in order to activate print)
            self._print_mindist_mat(search_info)

            # min_value is 25 for this scenario
            recognition_info = ConfigRecognition(
                is_stopped_by_eq9=True,
                iterations=0,
                min_value=25,
                is_algo_method_global=True,
                activate_spark=activate_spark,
                radius=radius,
                neighborhood_method=OPT_USING_COLLISIONS)

            print("radius {}:expected:                 {}".format(
                radius, expected_res))

            result = search_info.motif_neighborhood_global(
                recognition_info.min_value, recognition_info)

            print("radius {}:->global with collisions: {}".format(
                radius, result))

            # Same number of groups, each one being expected (any order)
            self.assertEqual(len(result), len(expected_res))
            for group in result:
                self.assertIn(group, expected_res)