def make_libs_df(self, X, y):
        '''
        Helper method:
        Writes out class labels & examples to files usable by SmashMatch
            (delimited by spaces, each row is a timeseries, create a different file for each series)

        Inputs -
            X (pd.DataFrame): timeseries examples of each class, each row is a
                timeseries
            y (pd.Dataframe): labels for each timeseries

        Returns -
            lib_files (list of LibFile objects)
        '''

        if "level" not in X.columns:
            X.insert(0, "level", y)

        X.sort_values("level")

        labels = y.unique().tolist()
        class_num = 0

        lib_files = []
        for label_ in labels:
            df = X.loc[X.level == label_]
            df = df.drop("level", 1)
            fname = self.get_unique_name(True)
            write_series(input_data=df,
                         file_dir=self.__file_dir,
                         filename=fname)
            lib_files.append(LibFile(class_num, label_, fname))
            class_num += 1
        return lib_files
示例#2
0
    def get_dm(self, quantized, first_run, max_len=None, \
    num_get_dms=5, details=False):
        '''
        Helper function:
        Calls bin/smash to compute the distance matrix on the given input
        timeseries and write I/O files necessary for Data Smashing

        Inputs -
            max_len (int): max length of data to use
            num_get_dms (int): number of runs of Smash to compute distance
                matrix (refines results)
            details (boolean): do (True) or do not (False) show cpu usage of
                Data Smashing algorithm

        Outuputs -
            (numpy.ndarray) distance matrix of the input timeseries
            (shape n_samples x n_samples)
        '''

        if not first_run:
            os.unlink(self.__input_dm_fh.name)
            self.__command = (self.__bin_path + "/smash")

        if not quantized:
            self.__input_dm_fh, self.__input_dm_fname = write_series(input_data=self._data,\
                                                                    file_dir=self.__file_dir)
        else:
            self.__input_dm_fh, self.__input_dm_fname = write_series(input_data=self.__quantized_data,\
                                                                    file_dir=self.__file_dir)

        self.__command += " -f " + self.__input_dm_fname + " -D row -T symbolic"

        if max_len is not None:
            self.__command += (" -L " + str(max_len))
        if num_get_dms is not None:
            self.__command += (" -n " + str(num_get_dms))
        if not details:
            self.__command += (" -t 0")

        self.__output_dm_fname = str(uuid.uuid4())
        self.__output_dm_fname = self.__output_dm_fname.replace("-", "")
        self.__command += (" -o " + self.__output_dm_fname)

        prev_wd = os.getcwd()
        os.chdir(self.__file_dir)
        sp.Popen(self.__command, shell=True, stderr=sp.STDOUT).wait()
        os.chdir(prev_wd)

        try:
            results = np.loadtxt(fname=(self.__file_dir + "/" +
                                        self.__output_dm_fname),
                                 dtype=float)
            return results
        except IOError:
            print "Error: Smash calculation unsuccessful. Please try again."
    def compute(self, X, input_length, num_repeats, no_details, force):
        '''
        Helper method:
        Calls SmashMatch on the specified data with the parameters specified;
            creates command string for SmashMatch binary and calls using
            SmashMatch; udpates internal variables

        Inputs -
            X (numpy.ndarray or pandas DataFrame): input data (each row is a
                different timeseries)
            input_length (int): length of the input timeseries to use
            num_repeats (int): number of times to run SmashMatch (for refining
                results)
            no_details (boolean): do not print SmashMatch processer usage and
                speed while running classification
            force (boolean): force re-classification on current dataset

        Outputs -
            (boolean) whether SmashMatch results corresponding to X were created/exist
        '''

        if force or self.should_calculate(
                X):  # dataset was not the same as before or first run
            if force:
                self.reset_input()

            if isinstance(X, np.ndarray):
                self.__input = X
                input_name_command = " -f " + self.write_series_nda(X)
            elif isinstance(X, pd.DataFrame):  # being explicit
                self.__input = X
                fname = self.get_unique_name(False)
                write_series(input_data=X,
                             file_dir=self.__file_dir,
                             filename=fname)
                input_name_command = " -f " + fname
            else:  # theoretically should be impossible, but to be explicit
                raise ValueError(
                    "Error: unsupported types for X. X can only be of type \
                numpy.ndarray or pandas.DataFrame.")

            if input_length is not None:
                input_length_command = " -x " + str(input_length)

            self.__command += (input_name_command + self.__lib_command +
                               "-T symbolic -D row ")
            self.__command += ("-L true true true -o " + PREFIX + " -d false")
            self.__command += (" -n " + str(num_repeats))

            if input_length is not None:
                self.__command += input_length_command
            if no_details:
                self.__command += " -t 0"

            os.chdir(self.__file_dir)
            sp.Popen(self.__command, shell=True, stderr=sp.STDOUT).wait()
            os.chdir(CWD)

            if not self.has_smashmatch(
            ):  # should theoretically be impossible \
                # to return False, but for safety
                return False
            else:  # successfully ran smashmatch to get results
                return True
        else:  # dataset was the same as before, use existing result files
            return True