def test_12_extract_class_label(self):
    """An unknown tag ('lbb') must raise with a descriptive message."""
    with self.assertRaises(Exception) as context:
        mdg.extract_tagged_info(TestMetaDataGetter.bad_file_name, 'lbb')
    # BUG FIX: the assertion used to sit *inside* the with-block, after the
    # raising call, so it was never executed. It must run after the block,
    # and the message check must be done on str(exception), not on the
    # exception object itself (which does not support `in`).
    self.assertIn(
        'Filename format is incorrect. Pair of braces not found after given '
        'expression.', str(context.exception))
 def test_11_extract_class_label(self):
     flare_class = mdg.extract_tagged_info(TestMetaDataGetter.bad_file_name,
                                           'lab')
     self.assertEqual(
         flare_class, '[B]',
         'Extracted class class_labels from file name is incorrect.')
 def test_10_extract_end_time(self):
     expected_time = '2011-01-24T[11:12:00]'
     actual_time = mdg.extract_tagged_info(TestMetaDataGetter.bad_file_name,
                                           'et')
     self.assertEqual(expected_time, actual_time, 'Expected != Actual')
 def test_09_extract_start_time(self):
     expected_time = '2011-01-24T[03:24:00]'
     actual_time = mdg.extract_tagged_info(TestMetaDataGetter.bad_file_name,
                                           'st')
     self.assertEqual(expected_time, actual_time, 'Expected != Actual')
 def test_08_extract_id(self):
     expected_id = '345[]'
     actual_id = mdg.extract_tagged_info(TestMetaDataGetter.bad_file_name,
                                         'id')
     self.assertEqual(expected_id, actual_id, 'Expected != Actual')
Example #6
0
    def do_extraction(self,
                      params_name: list = None,
                      params_index: list = None,
                      features_name: list = None,
                      features_index: list = None,
                      first_k: int = None,
                      need_interp: bool = True,
                      partition: list = None,
                      proc_id: int = None,
                      verbose: bool = False,
                      output_list: list = None):
        """
        Computes (based on the metadata loaded in the constructor) all of the statistical
        features on the MVTS data (per time series; column-wise) and stores the results in the
        public class field `df_all_features`.

        Note that only if the configuration file passed to the class constructor contains a list
        of the desired parameters and features the optional arguments can be skipped. So,
        please keep in mind the following:

            * For parameters: a selected list of parameters (i.e., column names in MVTS data) must
              be provided either through the configuration file or the method argument
              `params_name`. Also, the argument `params_index` can be used to work with a smaller
              list of parameters if a list of parameters is already provided in the config file.
            * For features: A selected list of features (i.e., statistical features available
              in `features.feature_collection.py`) MUST be provided, as mentioned above.

        :param params_name: (Optional) A list of parameter names of interest that can be used
                            instead of the list `MVTS_PARAMETERS` given in the config file. If
                            the list in the config file is NOT provided, then either this or
                            `params_index` MUST be given.
        :param params_index: (Optional) A list of column indices of interest that can be used
                             instead of the list `MVTS_PARAMETERS` given in the config file.
                             If the list in the config file is NOT provided, then either this or
                             `params_name` MUST be given.
        :param features_name: (Optional) A list of statistical features to be calculated on all
                              time series of each MVTS file. The statistical features are the
                              function names present in `features.feature_collection.py`. If they
                              are not provided in the config file (under `STATISTICAL_FEATURES`),
                              either this or `features_index` MUST be given.
        :param features_index: (Optional) A list of indices corresponding to the features
                               provided in the configuration file. If they are not provided in
                               the config file (under `STATISTICAL_FEATURES`), either this or
                               `features_name` MUST be given.
        :param first_k: (Optional) If provided, only the first `first_k` MVTS files will be
                        processed. This is mainly for getting some preliminary results in case the
                        number of MVTS files is too large.
        :param need_interp: True if a linear interpolation is needed to alter the missing numerical
                            values. This only takes care of the missing values and will not
                            affect the existing ones. Set it to False otherwise. Default is True.
        :param partition: (only for internal use)
        :param proc_id: (only for internal use)
        :param verbose: If set to True, the program prints on the console which files are being
                        processed and what processes (if parallel) are doing the work. The default
                        value is False.
        :param output_list: (only for internal use)

        :return: None
        """
        # Both internal-use arguments are present only when this method is run
        # as a worker of the parallel driver.
        is_parallel = proc_id is not None and output_list is not None

        # -----------------------------------------
        # Verify arguments
        # -----------------------------------------
        _evaluate_params(params_name,
                         params_index,
                         config_params_available=self.mvts_parameters is not None)
        _evaluate_features(features_name,
                           features_index,
                           config_features_available=self.statistical_features is not None)

        # -----------------------------------------
        # If features are provided using one of the optional arguments
        # override self.statistical_features with the given list.
        # -----------------------------------------
        if features_name is not None:
            self.statistical_features = features_name
        elif features_index is not None:
            self.statistical_features = [
                self.statistical_features[i] for i in features_index
            ]

        # -----------------------------------------
        # Get all files (or the first first_k ones).
        # -----------------------------------------
        if is_parallel:
            # Use the given `partition` instead of all csv files.
            all_csv_files = partition
        else:
            all_csv_files = self.all_mvts_paths
            if first_k is not None:
                # Note: If `first_k` was used in parallel version, it should have already been
                # taken into account in `do_execution_in_parallel`. So, no need to do it again.
                all_csv_files = all_csv_files[:first_k]

        # -----------------------------------------
        # If params are provided using one of the optional arguments,
        # override self.mvts_parameters with the given list.
        # -----------------------------------------
        if params_name is not None:
            self.mvts_parameters = params_name
        elif params_index is not None:
            self.mvts_parameters = [
                self.mvts_parameters[i] for i in params_index
            ]

        n_features = len(self.statistical_features)
        n = len(all_csv_files)
        p_parameters = len(self.mvts_parameters)
        t_tags = len(self.metadata_tags)

        if verbose:
            if is_parallel:
                print('\n\n\t-------------PID--{}---------------'.format(
                    proc_id))
            else:
                print('\n\n\t-----------------------------------')

            print('\t\tTotal No. of time series:\t{}'.format(n))
            print('\t\tTotal No. of Parameters:\t\t{}'.format(p_parameters))
            print('\t\tTotal No. of Features:\t\t{}'.format(n_features))
            print('\t\tTotal No. of Metadata Pieces:\t\t{}'.format(t_tags))
            print(
                '\t\tOutput Dimensionality (N:{} X (F:{} X P:{} + T:{})):\t{}'.
                format(n, n_features, p_parameters, t_tags,
                       n * (n_features * p_parameters + t_tags)))
            print('\t-----------------------------------\n')

        # Resolve the feature callables ONCE: the list of statistical features
        # does not change per file, so this lookup is loop-invariant.
        callable_features = extractor_utils.get_methods_for_names(
            self.statistical_features)

        # -----------------------------------------
        # Loop through each csv file and extract its features. Each per-file
        # row is collected in `all_row_dfs` and concatenated ONCE at the end;
        # the previous per-iteration `DataFrame.append` was deprecated (removed
        # in pandas 2.0) and quadratic in the number of files.
        # -----------------------------------------
        all_row_dfs = []
        i = 1
        for f in all_csv_files:
            if not f.endswith('.csv'):
                continue

            if verbose:
                if is_parallel:
                    print('\t PID:{} --> Total Processed: {} / {}'.format(
                        proc_id, i, n))
                else:
                    console_str = '\t >>> Total Processed: {0} / {1} <<<'.format(
                        i, n)
                    sys.stdout.write("\r" + console_str)
                    sys.stdout.flush()

            # `all_mvts_paths` / `partition` already hold absolute paths.
            df_mvts: pd.DataFrame = pd.read_csv(f, sep='\t')

            # -----------------------------------------
            # Keep the requested time series of mvts only.
            # -----------------------------------------
            df_raw = pd.DataFrame(df_mvts[self.mvts_parameters], dtype=float)

            # -----------------------------------------
            # Interpolate to get rid of the NaN values.
            # -----------------------------------------
            if need_interp:
                df_raw = utils.interpolate_missing_vals(df_raw)

            # -----------------------------------------
            # Extract all the features from each column of mvts.
            # -----------------------------------------
            extracted_features_df = extractor_utils.calculate_one_mvts(
                df_raw, callable_features)

            # -----------------------------------------
            # Extract the given meta data from this mvts name.
            # -----------------------------------------
            tags_dict = {
                tag: utils.extract_tagged_info(f, tag)
                for tag in self.metadata_tags
            }

            # -----------------------------------------
            # Flatten the resultant dataframe and add the metadata. Suppose in the meta data,
            # some pieces of information such as id, class label, start time and end time are
            # provided. The row_df will then have these columns:
            #   ID | LAB | ST | ET | FEATURE_1 | ... | FEATURE_n
            # -----------------------------------------
            row_dfs = [
                pd.DataFrame({tag: [extracted_info]})
                for tag, extracted_info in tags_dict.items()
            ]
            row_dfs.append(
                extractor_utils.flatten_to_row_df(extracted_features_df))
            all_row_dfs.append(pd.concat(row_dfs, axis=1))
            i = i + 1
            # LOOP ENDS HERE

        # -----------------------------------------
        # Build 'df_all_features' from all collected rows in a single concat.
        # -----------------------------------------
        if all_row_dfs:
            self.df_all_features = pd.concat(all_row_dfs, axis=0)
        self.df_all_features.reset_index(drop=True, inplace=True)

        if verbose:
            if is_parallel:
                print('\n\t^^^^^^^^^^^^^^^^^^^^PID: {0}^^^^^^^^^^^^^^^^^^^^^'.
                      format(proc_id))
            print('\n\t{0} files have been processed.'.format(i - 1))
            print(
                '\tAs a result, a dataframe of dimension {} X {} is created.'.
                format(self.df_all_features.shape[0],
                       self.df_all_features.shape[1]))

        if is_parallel:
            # Hand the result back to the parallel driver.
            output_list.append(self.df_all_features)