def make_libs_df(self, X, y):
    '''
    Helper method: Writes out class labels & examples to files usable by
    SmashMatch (one file per class label, written through write_series;
    each row of a file is one timeseries)

    Inputs -
        X (pd.DataFrame): timeseries examples of each class, each row is a
            timeseries. If X already has a "level" column it is used as
            the label column; otherwise y is attached as "level".
        y (pd.Series): label for each timeseries (must provide .unique())

    Returns -
        lib_files (list of LibFile objects): one entry per distinct label,
            numbered from 0 in the order the labels first appear in y
    '''
    if "level" not in X.columns:
        # Attach the labels to a copy so the caller's DataFrame is left
        # untouched (otherwise a second call would silently reuse the
        # stale "level" column from the first call).
        X = X.copy()
        X.insert(0, "level", y)

    # NOTE: the previous `X.sort_values("level")` call discarded its result
    # and therefore had no effect; rows are selected per label below, so no
    # sorting is required.
    lib_files = []
    for class_num, label_ in enumerate(y.unique().tolist()):
        # keep only this label's rows and strip the label column before
        # writing; `axis` must be a keyword on current pandas versions
        rows = X.loc[X["level"] == label_].drop("level", axis=1)
        fname = self.get_unique_name(True)
        write_series(input_data=rows, file_dir=self.__file_dir,
                     filename=fname)
        lib_files.append(LibFile(class_num, label_, fname))
    return lib_files
def get_dm(self, quantized, first_run, max_len=None,
           num_get_dms=5, details=False):
    '''
    Helper function: Calls bin/smash to compute the distance matrix on the
    given input timeseries and writes the I/O files necessary for
    Data Smashing

    Inputs -
        quantized (boolean): use the quantized data (True) or the raw
            data (False) as input to smash
        first_run (boolean): if False, the input file written by the
            previous run is deleted before a new one is written
        max_len (int): max length of data to use (passed as -L);
            omitted from the command when None
        num_get_dms (int): number of runs of Smash to compute distance
            matrix (refines results; passed as -n); omitted when None
        details (boolean): do (True) or do not (False) show cpu usage of
            Data Smashing algorithm

    Outputs -
        (numpy.ndarray) distance matrix loaded from the smash output file,
        or None if the output file could not be read
    '''
    if not first_run:
        # drop the input file left behind by the previous run
        os.unlink(self.__input_dm_fh.name)

    source = self.__quantized_data if quantized else self._data
    self.__input_dm_fh, self.__input_dm_fname = write_series(
        input_data=source, file_dir=self.__file_dir)

    # 32 hex characters, identical to str(uuid4()) with the dashes removed
    self.__output_dm_fname = uuid.uuid4().hex

    # Always rebuild the command from the bare binary path so that flags
    # from an earlier call can never accumulate.
    command = self.__bin_path + "/smash"
    command += " -f " + self.__input_dm_fname + " -D row -T symbolic"
    if max_len is not None:
        command += " -L " + str(max_len)
    if num_get_dms is not None:
        command += " -n " + str(num_get_dms)
    if not details:
        command += " -t 0"
    command += " -o " + self.__output_dm_fname
    self.__command = command

    # Run the binary inside the data directory via `cwd` instead of
    # os.chdir: the process-wide working directory is never modified, so
    # there is nothing to restore if launching the process fails.
    # NOTE(review): the command is a shell string (shell=True); the file
    # names used here are generated internally, but bin_path/file_dir
    # should not come from untrusted input.
    sp.Popen(self.__command, shell=True, stderr=sp.STDOUT,
             cwd=self.__file_dir).wait()

    output_path = os.path.join(self.__file_dir, self.__output_dm_fname)
    try:
        return np.loadtxt(fname=output_path, dtype=float)
    except IOError:
        # best-effort: report the failure and hand back None to the caller
        print("Error: Smash calculation unsuccessful. Please try again.")
        return None
def compute(self, X, input_length, num_repeats, no_details, force):
    '''
    Helper method: Calls SmashMatch on the specified data with the
    parameters specified; creates command string for SmashMatch binary and
    runs it; updates internal variables

    Inputs -
        X (numpy.ndarray or pandas DataFrame): input data (each row is a
            different timeseries)
        input_length (int): length of the input timeseries to use
            (passed as -x); omitted from the command when None
        num_repeats (int): number of times to run SmashMatch (for refining
            results; passed as -n)
        no_details (boolean): do not print SmashMatch processor usage and
            speed while running classification
        force (boolean): force re-classification on current dataset

    Outputs -
        (boolean) whether SmashMatch results corresponding to X were
        created/exist

    Raises -
        ValueError: if X is neither a numpy.ndarray nor a pandas.DataFrame
    '''
    if not (force or self.should_calculate(X)):
        # dataset was the same as before, use existing result files
        return True

    # dataset was not the same as before, first run, or forced re-run
    if force:
        self.reset_input()

    if isinstance(X, np.ndarray):
        self.__input = X
        input_name_command = " -f " + self.write_series_nda(X)
    elif isinstance(X, pd.DataFrame):  # being explicit
        self.__input = X
        fname = self.get_unique_name(False)
        write_series(input_data=X, file_dir=self.__file_dir, filename=fname)
        input_name_command = " -f " + fname
    else:  # theoretically should be impossible, but to be explicit
        raise ValueError(
            "Error: unsupported types for X. X can only be of type "
            "numpy.ndarray or pandas.DataFrame.")

    # Flags are appended to the command already held on the instance.
    self.__command += (input_name_command + self.__lib_command +
                       "-T symbolic -D row ")
    self.__command += "-L true true true -o " + PREFIX + " -d false"
    self.__command += " -n " + str(num_repeats)
    if input_length is not None:
        self.__command += " -x " + str(input_length)
    if no_details:
        self.__command += " -t 0"

    # Run the binary inside the data directory via `cwd` rather than
    # os.chdir: the caller's working directory is never changed, so it
    # cannot be left pointing at the data directory if the launch fails.
    sp.Popen(self.__command, shell=True, stderr=sp.STDOUT,
             cwd=self.__file_dir).wait()

    # should theoretically always be True here, but checked for safety
    return bool(self.has_smashmatch())