def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                        train_set_name_suffix, target_collection_params):
    """ Merge all collections in source_collection_pathes and store them \
    in the target collection"""
    # load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    author = get_author()
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name if present
    try:
        target_collection.meta_data.pop("node_chain_file_name")
    except KeyError:
        pass
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_collection_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
    })
    # merge data of all other collections to target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                target_data = target_collection.get_data(
                    run, split, train_set_name_suffix)
                # actual data is stored in a list that has to be extended
                target_data.extend(data)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in target_collection.data.keys():
            assert("test" == key[2])
            value = target_collection.data.pop(key)
            key = (key[0], key[1], "train")
            target_collection.data[key] = value
    target_collection.store(target_collection_path)
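# Hedged standalone sketch (not part of pySPACE): illustrates the key
# re-labelling performed at the end of _merge_pickle_files, where every
# (run, split, "test") key of the merged collection is renamed to
# (run, split, "train") before storing.  The toy dictionary below merely
# stands in for target_collection.data.
def relabel_test_as_train(data):
    """Return a copy of *data* with the third key element set to 'train'."""
    relabelled = {}
    for (run, split, suffix) in list(data.keys()):
        assert suffix == "test"
        relabelled[(run, split, "train")] = data[(run, split, suffix)]
    return relabelled

toy_data = {(0, 0, "test"): ["sample_a"], (0, 1, "test"): ["sample_b"]}
print(sorted(relabel_test_as_train(toy_data).keys()))
# -> [(0, 0, 'train'), (0, 1, 'train')]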
def store(self, result_dir, s_format="None"):
    if not s_format == "None":
        self._log("The format %s is not supported!" % s_format,
                  level=logging.CRITICAL)
        return
    # Update the meta data
    author = get_author()
    self.update_meta_data({
        "type": "only output of individual nodes stored",
        "storage_format": s_format,
        "author": author,
        "data_pattern": "no data stored"})
    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def store(self, result_dir, s_format="None"): if not s_format == "None": self._log("The format %s is not supported!" % s_format, level=logging.CRITICAL) return # Update the meta data author = get_author() self.update_meta_data({ "type": "only output of individual nodes stored", "storage_format": s_format, "author": author, "data_pattern": "no data stored" }) # Store meta data BaseDataset.store_meta_data(result_dir, self.meta_data)
def store(self, result_dir, s_format="pickle"): """ Stores this collection in the directory *result_dir*. In contrast to *dump* this method stores the collection not in a single file but as a whole directory structure with meta information etc. The data sets are stored separately for each run, split, train/test combination. **Parameters** :result_dir: The directory in which the collection will be stored. :name: The prefix of the file names in which the individual data sets are stored. The actual file names are determined by appending suffixes that encode run, split, train/test information. (*optional, default: "time_series"*) :s_format: The format in which the actual data sets should be stored. Possible formats are 'pickle', 'text', 'csv' and 'mat' (matlab) format. If s_format is a list, the second element further specifies additional options for storing. - pickle: Standard Python format - text: In the text format, all time series objects are concatenated to a single large table containing only integer values. - csv: For the csv format comma separated values are taken as default or a specified Python format string. - mat: Scipy's savemat function is used for storing. Thereby the data is stored as 3 dimensional array. Also meta data information, like sampling frequency and channel names are saved. As an additional parameter the orientation of the data arrays can be given as 'channelXtime' or 'timeXchannel' .. note:: For the text and MATLAB format, markers could be added by using a Marker_To_Mux node before (*optional, default: "pickle"*) .. todo:: Put marker to the right time point and also write marker channel. .. todo:: Shouldn't be 'text' and 'csv' format part of the stream data set?! """ name = "time_series" # for some storage procedures we need further specifications s_type = None if type(s_format) == list: # file format is first position f_format = s_format[0] if len(s_format) > 1: s_type = s_format[1] else: f_format = s_format if f_format == "text" and s_type is None: s_type = "%i" elif f_format == "csv" and s_type == "real": s_type = "%.18e" # Update the meta data author = get_author() self.update_meta_data({"type": "time_series", "storage_format": s_format, "author": author, "data_pattern": "data_run" + os.sep + name + "_sp_tt." 
+ f_format}) # Iterate through splits and runs in this dataset for key, time_series in self.data.iteritems(): # load data, if necessary # (due to the lazy loading, the data might be not loaded already) if isinstance(time_series, basestring): time_series = self.get_data(key[0], key[1], key[2]) if self.sort_string is not None: time_series.sort(key=eval(self.sort_string)) # Construct result directory result_path = result_dir + os.sep + "data" + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Store data depending on the desired format if f_format in ["pickle", "cpickle", "cPickle"]: result_file = open(os.path.join(result_path, name+key_str+".pickle"), "w") cPickle.dump(time_series, result_file, cPickle.HIGHEST_PROTOCOL) result_file.close() elif f_format in ["text","csv"]: self.update_meta_data({ "type": "stream", "marker_column": "marker"}) result_file = open(os.path.join(result_path, name + key_str + ".csv"), "w") csvwriter = csv.writer(result_file) channel_names = copy.deepcopy(time_series[0][0].channel_names) if f_format == "csv": channel_names.append("marker") csvwriter.writerow(channel_names) for (data, key) in time_series: if f_format == "text": numpy.savetxt(result_file, data, delimiter=",", fmt=s_type) if not key is None: result_file.write(str(key)) result_file.flush() elif data.marker_name is not None \ and len(data.marker_name) > 0: result_file.write(str(data.marker_name)) result_file.flush() else: first_line = True marker = "" if not key is None: marker = str(key) elif data.marker_name is not None \ and len(data.marker_name) > 0: marker = str(data.marker_name) for line in data: l = list(line) l.append(marker) csvwriter.writerow(list(l)) if first_line: first_line = False marker = "" result_file.flush() result_file.close() elif f_format in ["matlab", "mat", "MATLAB"]: # todo: handle all the other attributes of ts objects! 
import scipy.io result_file_name = os.path.join(result_path, name + key_str + ".mat") # extract a first time series object to get meta data ts1 = time_series[0][0] # collect all important information in the collection_object dataset_dict = { "sampling_frequency": ts1.sampling_frequency, "channel_names": ts1.channel_names} # we have to extract the data and labels separatly if 'channelXtime' in s_format: dataset_dict["data"] = [data.T for data, _ in time_series] else: dataset_dict["data"] = [data for data, _ in time_series] dataset_dict["labels"] = [label for _, label in time_series] # construct numpy 3d array (e.g., channelXtimeXtrials) dataset_dict["data"] = numpy.rollaxis(numpy.array( dataset_dict["data"]), 0, 3) scipy.io.savemat(result_file_name, mdict=dataset_dict) elif f_format in ["bp_eeg"]: result_file = open(os.path.join(result_path, name + key_str + ".eeg"),"a+") result_file_mrk = open(os.path.join(result_path, name + key_str + ".vmrk"),"w") result_file_mrk.write("Brain Vision Data Exchange Marker File, " "Version 1.0\n") result_file_mrk.write("; Data stored by pySPACE\n") result_file_mrk.write("[Common Infos]\n") result_file_mrk.write("Codepage=UTF-8\n") result_file_mrk.write("DataFile=%s\n" % str(name + key_str + ".eeg")) result_file_mrk.write("\n[Marker Infos]\n") markerno = 1 datapoint = 1 sf = None channel_names = None for t in time_series: if sf is None: sf = t[0].sampling_frequency if channel_names is None: channel_names = t[0].get_channel_names() for mrk in t[0].marker_name.keys(): for tm in t[0].marker_name[mrk]: result_file_mrk.write(str("Mk%d=Stimulus,%s,%d,1,0\n" % (markerno, mrk, datapoint+(tm*sf/1000.0)))) markerno += 1 data_ = t[0].astype(numpy.int16) data_.tofile(result_file) datapoint += data_.shape[0] result_hdr = open(os.path.join(result_path, name + key_str + ".vhdr"),"w") result_hdr.write("Brain Vision Data Exchange Header " "File Version 1.0\n") result_hdr.write("; Data stored by pySPACE\n\n") result_hdr.write("[Common Infos]\n") result_hdr.write("Codepage=UTF-8\n") result_hdr.write("DataFile=%s\n" % str(name + key_str + ".eeg")) result_hdr.write("MarkerFile=%s\n" % str(name + key_str + ".vmrk")) result_hdr.write("DataFormat=BINARY\n") result_hdr.write("DataOrientation=MULTIPLEXED\n") result_hdr.write("NumberOfChannels=%d\n" % len(channel_names)) result_hdr.write("SamplingInterval=%d\n\n" % (1000000/sf)) result_hdr.write("[Binary Infos]\n") result_hdr.write("BinaryFormat=INT_16\n\n") result_hdr.write("[Channel Infos]\n") # TODO: Add Resolutions to time_series # 0 = 0.1 [micro]V, # 1 = 0.5 [micro]V, # 2 = 10 [micro]V, # 3 = 152.6 [micro]V (seems to be unused!) resolutions_str = [unicode("0.1,%sV" % unicode(u"\u03BC")), unicode("0.5,%sV" % unicode(u"\u03BC")), unicode("10,%sV" % unicode(u"\u03BC")), unicode("152.6,%sV" % unicode(u"\u03BC"))] for i in range(len(channel_names)): result_hdr.write(unicode("Ch%d=%s,,%s\n" % (i+1,channel_names[i], unicode(resolutions_str[0]))).encode('utf-8')) result_file.close() else: NotImplementedError("Using unavailable storage format:%s!" % f_format) self.update_meta_data({ "channel_names": copy.deepcopy(time_series[0][0].channel_names), "sampling_frequency": time_series[0][0].sampling_frequency }) #Store meta data BaseDataset.store_meta_data(result_dir, self.meta_data)
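# Hedged standalone sketch (assumes only numpy): shows the 3-D array layout
# produced by the "mat" branch above.  A list of per-trial arrays of shape
# (time, channels) is stacked and the trial axis is rolled to the end, so
# scipy.io.savemat receives a time x channels x trials array (or
# channels x time x trials when 'channelXtime' is requested and every trial
# is transposed first).  Shapes are invented for the example.
import numpy

trials = [numpy.zeros((100, 4)) for _ in range(7)]   # 7 trials, 100 samples, 4 channels
stacked = numpy.array(trials)                        # shape (7, 100, 4)
rolled = numpy.rollaxis(stacked, 0, 3)               # shape (100, 4, 7)
print("%s -> %s" % (stacked.shape, rolled.shape))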
def store(self, result_dir, s_format=["pickle", "real"]): """ store the collection in *result_dir*""" name = "predictions" # Update the meta data author = get_author() self.update_meta_data({ "type": "prediction_vector", "storage_format": s_format, "author": author, "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format[0] }) if not s_format in ["csv", "arff", "pickle"]: self._log("Storage format not supported! Using default.", level=logging.ERROR) s_format = "pickle" for key, prediction_vectors in self.data.iteritems(): # Construct result directory result_path = result_dir + os.sep + "data" \ + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Store data depending on the desired format if s_format == "pickle": result_file = open( os.path.join(result_path, name + key_str + ".pickle"), "w") cPickle.dump(prediction_vectors, result_file, cPickle.HIGHEST_PROTOCOL) elif s_format == "csv": # Write as Comma Separated Value result_file = open( os.path.join(result_path, name + key_str + ".csv"), "w") if self.meta_data["num_predictors"] == 1: result_file.write( "Predicted Label, Prediction Score, True Label \n") for pv in prediction_vectors: result_file.write( "%s, %s, %s\n" % (pv[0].label[0], pv[0].prediction[0], pv[1])) else: # we begin by dealing with the header of the csv file base_header = "Predicted %(index)d Label, Prediction %(index)d Score, " base_result = "%(label)s, %(score)s," header = "" for i in range(self.meta_data["num_predictors"]): header += base_header % dict(index=i + 1) header += "True Label\n" result_file.write(header) # and now we can write each of the prediction vectors in turn for pv in prediction_vectors: result = "" for i in range(self.meta_data["num_predictors"]): result += base_result % dict( label=pv[0].label[i], score=pv[0].prediction[i]) result += str(pv[1]) + "\n" result_file.write(result) #Store meta data BaseDataset.store_meta_data(result_dir, self.meta_data)
def store(self, result_dir, s_format=["pickle", "real"]): """ Stores this collection in the directory *result_dir*. In contrast to *dump* this method stores the collection not in a single file but as a whole directory structure with meta information etc. The data sets are stored separately for each run, split, train/test combination. The method expects the following parameters: * *result_dir* The directory in which the collection will be stored * *name* The prefix of the file names in which the individual \ data sets are stored. The actual file names are determined \ by appending suffixes that encode run, split, train/test \ information. Defaults to "features". * *format* A list with information about the format in which the actual data sets should be stored. The first entry specifies the file format. If it is "arff" the second entry specifies the attribute format. Examples: ["arff", "real"], ["arff", "{0,1}"] .. todo:: Someone could implement the format ["fasta"] for sax features To store the data in comma separated values, use ["csv", "real"]. (*optional, default: ["pickle", "real"]*) .. todo:: Adapt storing of csv file to external library instead of doing it manually. """ name = "features" # Update the meta data author = get_author() self.update_meta_data({ "type": "feature_vector", "storage_format": s_format, "author": author, "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format[0] }) if type(s_format) == list: s_type = s_format[1] s_format = s_format[0] else: s_type = "real" if not s_format in ["csv", "arff", "pickle"]: self._log("Storage format not supported! Using default.", level=logging.ERROR) s_format = "pickle" # Iterate through splits and runs in this dataset for key, feature_vectors in self.data.iteritems(): # test if dataset has already been loaded. # Otherwise replace with iterator to loaded version. 
if isinstance(feature_vectors, basestring): feature_vectors = self.get_data(key[0], key[1], key[2]) # Construct result directory result_path = result_dir + os.sep + "data" \ + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Store data depending on the desired format if s_format == "pickle": result_file = open( os.path.join(result_path, name + key_str + ".pickle"), "w") cPickle.dump(feature_vectors, result_file, cPickle.HIGHEST_PROTOCOL) elif s_format == "arff": # Write as ARFF result_file = open( os.path.join(result_path, name + key_str + ".arff"), "w") # Create the arff file header relation_name = result_dir.split(os.sep)[-1] result_file.write('@relation "%s"\n' % relation_name) # Write the type of all features for feature_name in self.meta_data["feature_names"]: result_file.write("@attribute %s %s\n" % (feature_name, s_type)) classString = "" + ",".join( sorted(self.meta_data["classes_names"])) + "" result_file.write("@attribute class {%s}\n" % classString) result_file.write("@data\n") # Write all given training data into the ARFF file fv = feature_vectors[0][0] if numpy.issubdtype(fv.dtype, numpy.string_): feature_format = "%s," elif numpy.issubdtype(fv.dtype, numpy.floating): feature_format = "%f," elif numpy.issubdtype(fv.dtype, numpy.integer): feature_format = "%d," for features, class_name in feature_vectors: for feature in features[0]: result_file.write(feature_format % feature) result_file.write("%s\n" % str(class_name)) elif s_format == "csv": # Write as Comma Separated Value result_file = open( os.path.join(result_path, name + key_str + ".csv"), "w") for feature_name in self.meta_data["feature_names"]: result_file.write("%s," % (feature_name)) result_file.write("\n") fv = feature_vectors[0][0] if numpy.issubdtype(fv.dtype, numpy.floating): feature_format = "%f," elif numpy.issubdtype(fv.dtype, numpy.integer): feature_format = "%d," else: feature_format = "%s," for features, class_name in feature_vectors: f = features.view(numpy.ndarray) for feature in f[0]: result_file.write(feature_format % feature) result_file.write("%s\n" % str(class_name)) result_file.close() #Store meta data BaseDataset.store_meta_data(result_dir, self.meta_data)
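# Hedged standalone sketch: writes a tiny ARFF file with the same header
# layout as the "arff" branch above (one attribute per feature, a nominal
# class attribute, then the data section).  Feature names, classes, and
# values are invented; no pySPACE objects are involved.
feature_names = ["mean_channel_1", "var_channel_1"]
classes = ["Standard", "Target"]
rows = [([0.25, 1.75], "Target"), ([0.10, 0.90], "Standard")]

with open("example.arff", "w") as arff:
    arff.write('@relation "example_collection"\n')
    for feature_name in feature_names:
        arff.write("@attribute %s real\n" % feature_name)
    arff.write("@attribute class {%s}\n" % ",".join(sorted(classes)))
    arff.write("@data\n")
    for features, class_name in rows:
        for feature in features:
            arff.write("%f," % feature)
        arff.write("%s\n" % class_name)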
def _merge_files(self, target_collection_path, source_collection_pathes,
                 train_set_name_suffix, target_collection_params):
    """ Merge all collections in source_collection_pathes and store them \
    in the target collection

    **Parameters**

        :target_collection_path:
            Path of the dataset, in which the data of all other datasets
            is assembled.

        :source_collection_pathes:
            Paths of the datasets to be merged.

        :train_set_name_suffix:
            Either 'train' or 'test'. Specifies if datasets are merged
            for training or testing.

        :target_collection_params:
            Dictionary with all the parameters of the target dataset.
    """
    # load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    author = get_author()
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name if present
    try:
        target_collection.meta_data.pop("node_chain_file_name")
    except KeyError:
        pass
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_collection_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
    })
    # merge data of all other collections to target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                target_data = target_collection.get_data(
                    run, split, train_set_name_suffix)
                if self.set_flag:
                    for ts, l in target_data:
                        if ts.specs is None:
                            ts.specs = {"new_set": False}
                        elif ts.specs.has_key("new_set"):
                            break
                        else:
                            ts.specs["new_set"] = False
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                if self.set_flag:
                    for i, (ts, l) in enumerate(data):
                        # flag first element of the concatenated data list
                        if ts.specs is None:
                            ts.specs = {"new_set": i == 0}
                        else:
                            ts.specs["new_set"] = (i == 0)
                # actual data is stored in a list that has to be extended
                target_data.extend(data)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing
        for key in target_collection.data.keys():
            assert("test" == key[2])
            value = target_collection.data.pop(key)
            key = (key[0], key[1], "train")
            target_collection.data[key] = value
    # we store the data in the same format as before
    target_collection.store(target_collection_path,
                            target_collection.meta_data["storage_format"])
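# Hedged standalone sketch: reproduces the "new_set" flagging used in
# _merge_files on plain dictionaries.  Windows already present keep
# new_set=False, and only the first window of every appended dataset is
# flagged True, which lets later processing detect dataset boundaries in
# the concatenated list.
target_data = [{"specs": {"new_set": False}} for _ in range(3)]
new_data = [{"specs": None} for _ in range(4)]

for i, ts in enumerate(new_data):
    if ts["specs"] is None:
        ts["specs"] = {"new_set": i == 0}
    else:
        ts["specs"]["new_set"] = (i == 0)
target_data.extend(new_data)

print([ts["specs"]["new_set"] for ts in target_data])
# -> [False, False, False, True, False, False, False]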
def store(self, result_dir, s_format = "BrainVision"): self.merged = False scale = 10.0 # is used to scale up the eeg sample values. The data samples are converted to int16 # when saving, so scaling is necessary to keep maintain the resolutions. # Keep original file name, depends on the AnalyserSinkNode, see it's documentation. if self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] is not None: name = self.meta_data['eeg_src_file_name'] # or use default name from this collection else: name = "Analyzer" if not s_format == "BrainVision": self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL) return # Update the meta data author = get_author() self.update_meta_data({"type": "only output of individual nodes stored", "storage_format": s_format, "author" : author, "data_pattern": "Multiplexed"}) # Store meta data BaseDataset.store_meta_data(result_dir,self.meta_data) #self._log("EEG data file %s" % self.collection.data_file) slices = [] slices.append(0) channel_names = [] for key, time_series in self.data.iteritems(): # Sort the Times-Series Array def cmp_start(a, b): return cmp(a[0].start_time, b[0].start_time) time_series.sort(cmp_start) # Check for overlapping Windows and remove them if existent i = 0 while i < len(time_series): ts = time_series[i] #print ts[0].start_time, ts[0].end_time #print len(time_series) if ts[0].start_time >= slices[-1]: slices.append(ts[0].end_time) else: warnings.warn("Ignoring at least one overlapping window!", UserWarning) i = i+1 # STORE ACTUAL EEG DATA AND WRITE MARKERFILE result_path = result_dir + os.sep + "data_analyzer" \ + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Keep original name if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None): result_file_eeg = open(os.path.join(result_path, name + ".eeg"), "wb") result_file_mrk = open(os.path.join(result_path, name + ".vmrk"), "w") # or use default name from this collection else: result_file_eeg = open(os.path.join(result_path, name + key_str + ".eeg"), "wb") result_file_mrk = open(os.path.join(result_path, name + key_str + ".vmrk"), "w") # Write Marker header if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None): result_file_mrk.write(header_mrk % (name)) else: result_file_mrk.write(header_mrk % (name + key_str)) result_file_ms = 0 # Data for padding padding = None count_mrk = 2 num_ch = 0 sampling_int = 0 for ts in time_series: ts0 = ts[0] * scale ts0 = ts0.astype(numpy.int16) if padding == None: padding = numpy.zeros(len(ts[0].channel_names), dtype='int16') num_ch = len(ts[0].channel_names) channel_names = ts[0].channel_names sampling_int = 1000000/ts[0].sampling_frequency #print "writing %d channels.." 
% len(ts[0].channel_names) # Write Padding (zeros) while result_file_ms < ts[0].start_time - sampling_int/1000.0: result_file_eeg.write(padding.tostring()) result_file_ms += ts[0]._samples_to_ms(1) # Write window ts0.tofile(result_file_eeg) result_file_ms += ts[0].end_time - (ts[0].start_time - sampling_int/1000.0) # Write Marker markers = [] if(len(ts[0].marker_name) > 0): mk_keys = ts[0].marker_name.keys() mk_values = ts[0].marker_name.values() for mk in range(len(mk_keys)): for mv in range(len(mk_values[mk])): markers.append((mk_keys[mk], mk_values[mk][mv])) markers = sorted(markers, key=lambda tup: tup[1]) for i in range(len(markers)): if 'R' in markers[i][0]: event_type = 'Response' elif 'S' in markers[i][0]: event_type = 'Stimulus' else: event_type = 'Label' result_file_mrk.write("Mk%d=%s,%s,%d,1,0\n" % (count_mrk, event_type, markers[i][0], (ts[0].start_time + markers[i][1])*ts[0].sampling_frequency/1000.0)) count_mrk += 1 # WRITE HEADERFILE # Keep original name if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None): result_file_hdr = open(os.path.join(result_path, name + ".vhdr"), "w") result_file_hdr.write(header_hdr % ((name), (name), num_ch, sampling_int)) # or use default name from this collection else: result_file_hdr = open(os.path.join(result_path, name + key_str + ".vhdr"), "w") result_file_hdr.write(header_hdr % ((name + key_str), (name + key_str), num_ch, sampling_int)) # Format: Ch1=Fp1,,0.1,\xB5V for i in range(num_ch): result_file_hdr.write("Ch%d=%s,,%.2f,\xB5V\n" % (i+1,channel_names[i], 1./scale)) result_file_hdr.close() result_file_eeg.close() result_file_mrk.close()
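# Hedged standalone sketch: the .vmrk marker lines above place each marker at
# a sample index computed from times in milliseconds and the sampling
# frequency.  The numbers below are invented; the arithmetic mirrors
# (start_time + marker_offset) * sampling_frequency / 1000.0 from the code.
sampling_frequency = 250.0        # Hz
window_start_time = 2000.0        # ms
marker_offset = 120.0             # ms relative to the window start
sample_index = (window_start_time + marker_offset) * sampling_frequency / 1000.0
print("Mk2=Stimulus,S 1,%d,1,0" % sample_index)   # -> Mk2=Stimulus,S 1,530,1,0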
def consolidate(self): """ Consolidates the results obtained by the single WEKA filter processes into a consistent summary of datasets that is stored on the file system. .. todo:: Some of the contents of this method should go into the :class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset` """ # Iterate over all collections and store the collection meta data etc. for entries in os.listdir(self.result_directory): fullpath = os.path.join(self.result_directory, entries) # For each collection if os.path.isdir(fullpath): if entries.startswith("{"): # Extract the parameters from the collection name in order to # adjust the relation name if self.num_parameters > 0: parameter_strings = entries.strip("}{").split("}{")[-self.num_parameters:] parameter_postfix = "{" + "}{".join(parameter_strings) + "}" else: parameter_strings = "" parameter_postfix = "" # Postprocessing of the arff files of this collection for train_arff_file in glob.glob(fullpath + os.sep + "data_run*" + os.sep + "*train.arff"): # Adjust the relation name of the train file content = open(train_arff_file, 'r').readlines() # We strip everything after the last "}" endindex = content[0].rfind("}") content[0] = content[0][:endindex+1] content[0] += parameter_postfix + "'" open(train_arff_file, 'w').writelines(content) # Use relation name of train data for test data test_arff_file = train_arff_file.replace("train.arff", "test.arff") test_content = open(test_arff_file, 'r').readlines() test_content[0] = content[0] + "\n" open(test_arff_file, 'w').writelines(test_content) # Check which features are contained in the arff file feature_names = [] for line in content: if line.startswith("@attribute"): attribute = line.split()[1] if attribute is not "class": feature_names.append(attribute) # Store the collection meta data etc. if self.num_parameters > 0: input_collection_name = \ "{" + "}{".join(entries.strip("}{").split("}{")[:-self.num_parameters]) + "}" else: input_collection_name = entries input_collection_path = os.path.join(self.operation_spec["input_path"], input_collection_name) input_collection_meta = BaseDataset.load_meta_data( pySPACE.configuration.storage + os.sep + input_collection_path) # Store the input collection BaseDataset.store_meta_data(fullpath, input_collection_meta, file_name="input_metadata.yaml") # Adjust collection metadata for the new collection input_collection_meta["feature_names"] = feature_names input_collection_meta["num_features"] = len(feature_names) input_collection_meta["author"] = get_author() input_collection_meta["date"] = time.strftime("%Y%m%d") input_collection_meta["input_collection_name"] = input_collection_name # Write the collection meta information into the folder BaseDataset.store_meta_data(fullpath,input_collection_meta) # Store the command_template command_template_file = open(os.path.join(fullpath, "command_template"), 'w') command_template_file.write(self.command_template) command_template_file.close() else: # training and test arff need the same relation name # otherwise Weka can't relate it to each other; the collection # name and the parameters in {}{}-optic must be the relation # name for further processing self._log("WARNING: Collection name doesn't begin with '{'. 
Further processing may be collapse!", level= logging.WARNING) # Write the specification of this operation # to the result directory in order to make later # analysis of results more easy source_operation_file = open(os.path.join(self.result_directory, "source_operation.yaml"), 'w') yaml.dump(self.operation_spec, source_operation_file) source_operation_file.close()
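# Hedged standalone sketch: mirrors the @relation rewrite in the WEKA
# consolidate() above.  Everything after the last "}" of the relation name
# is dropped and the parameter postfix of the result folder is appended, so
# the train and test ARFF files of one parameter setting share the same
# relation name.  The relation line and postfix are invented examples.
relation_line = "@relation '{input_dataset}{old_param 1}'\n"
parameter_postfix = "{kernel_type : linear}"

endindex = relation_line.rfind("}")
relation_line = relation_line[:endindex + 1] + parameter_postfix + "'"
print(relation_line)
# -> @relation '{input_dataset}{old_param 1}{kernel_type : linear}'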
def consolidate(self, _=None): """ Consolidates the results obtained by the single processes into a consistent structure of collections that are stored on the file system. """ # Consolidate the results directory_pattern = os.sep.join([ self.result_directory, "{*", ]) dataset_pathes = glob.glob(directory_pattern) # For all collections found for dataset_path in dataset_pathes: try: # Load their meta_data meta_data = BaseDataset.load_meta_data(dataset_path) # Determine author and date author = get_author() date = time.strftime("%Y%m%d_%H_%M_%S") # Update meta data and store it meta_data.update({"author": author, "date": date}) # There can be either run dirs, persistency dirs, or both of them. # Check of whichever there are more. If both exist, their numbers # are supposed to be equal. nr_run_dirs = len( glob.glob(os.path.join(dataset_path, "data_run*"))) nr_per_dirs = len( glob.glob(os.path.join(dataset_path, "persistency_run*"))) nr_runs = max(nr_run_dirs, nr_per_dirs) if nr_runs > 1: meta_data["runs"] = nr_runs # Store the metadata BaseDataset.store_meta_data(dataset_path, meta_data) # Copy the input dataset specification file to the result # directory in order to make later analysis of # the results more easy # THA: Split the first "/" from the input collection name, because otherwise it will be treated # as an absolute path input_collection_name = meta_data["input_collection_name"][1:] if \ meta_data["input_collection_name"][0] == os.sep else meta_data["input_collection_name"] input_meta_path = os.path.join(pySPACE.configuration.storage, input_collection_name) try: input_meta = BaseDataset.load_meta_data(input_meta_path) BaseDataset.store_meta_data( dataset_path, input_meta, file_name="input_metadata.yaml") except (IOError, OSError) as e: self._log("Error copying the input_metadata.yaml: {error}". format(error=e.message), level=logging.CRITICAL) except Exception as e: logging.getLogger("%s" % self).exception( "Error updating the metadata: {error!s}".format(error=e)) raise e # If we don't create a feature vector or time series collection, # we evaluated our classification using a classification performance sink. # The resulting files should be merged to one csv tabular. pathlist = glob.glob(os.path.join(self.result_directory, "results_*")) if len(pathlist) > 0: # Do the consolidation the same way as for WekaClassificationOperation self._log("Consolidating results ...") # We load and store the results once into a PerformanceResultSummary # This does the necessary consolidation... self._log("Reading intermediate results...") try: result_collection = PerformanceResultSummary( dataset_dir=self.result_directory) self._log("done") self._log("Storing result collection") result_collection.store(self.result_directory) self._log("done") PerformanceResultSummary.merge_traces(self.result_directory) except Exception as e: logging.getLogger("%s" % self).exception( "Error merging the result collection: {error!s}".format( error=e)) if self.compression: # Since we get one result summary, # we don't need the numerous folders. # So we zip them to make the whole folder more easy visible. import zipfile cwd = os.getcwd() os.chdir(self.result_directory) # If there are to many or to large folders, problems may occur. # This case we want to log, try 64 bit mode, # and then skip the zipping. 
try: pathlist = glob.glob( os.path.join(self.result_directory, "{*}")) if not self.compression == "delete": save_file = zipfile.ZipFile( self.result_directory + '/result_folders.zip', mode="w", compression=self.compression) # we want to have the zipped file relative to the # result directory for path in pathlist: for node in os.walk(path): rel_path = os.path.relpath( node[0], self.result_directory) save_file.write(rel_path) for data in node[2]: save_file.write( os.path.join(rel_path, data)) save_file.close() # To still have an easy access to the history of the # processing, we keep one folder. pathlist.pop() for path in pathlist: shutil.rmtree(path) except Exception, e: self._log("Result files could not be compressed with 32" + " bit mode, switching to 64 bit mode", level=logging.CRITICAL) # nearly total code copy, only difference with 64 bit mode try: pathlist = glob.glob( os.path.join(self.result_directory, "{*}")) save_file = zipfile.ZipFile( self.result_directory + '/result_folders.zip', mode="w", compression=self.compression, allowZip64=True) # we want to have the zipped file relative to the # result directory for path in pathlist: for node in os.walk(path): rel_path = os.path.relpath( node[0], self.result_directory) save_file.write(rel_path) for data in node[2]: save_file.write( os.path.join(rel_path, data)) save_file.close() # To still have an easy access to the history of the # processing, we keep one folder. pathlist.pop() for path in pathlist: shutil.rmtree(path) except: self._log( "64 bit mode also failed. Please check your files and your code or contact your local programmer!", level=logging.CRITICAL) os.chdir(cwd)
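# Hedged standalone sketch (standard library only): shows the zipfile/os.walk
# pattern used above to archive result folders with paths stored relative to
# the result directory.  result_directory and the "{*}" folders are assumed
# to exist; unlike the code above, the archive name is passed explicitly
# instead of relying on os.chdir.  allowZip64=True corresponds to the
# "64 bit mode" fallback.
import glob
import os
import zipfile

def zip_result_folders(result_directory):
    archive_path = os.path.join(result_directory, "result_folders.zip")
    archive = zipfile.ZipFile(archive_path, mode="w",
                              compression=zipfile.ZIP_DEFLATED,
                              allowZip64=True)
    for path in glob.glob(os.path.join(result_directory, "{*}")):
        for dir_path, _, file_names in os.walk(path):
            rel_path = os.path.relpath(dir_path, result_directory)
            archive.write(dir_path, rel_path)
            for file_name in file_names:
                archive.write(os.path.join(dir_path, file_name),
                              os.path.join(rel_path, file_name))
    archive.close()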
def store(self, result_dir, s_format=["pickle", "real"]): """ store the collection in *result_dir*""" name = "predictions" # Update the meta data author = get_author() self.update_meta_data({"type": "prediction_vector", "storage_format": s_format, "author": author, "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format[0]}) if not s_format in ["csv", "arff", "pickle"]: self._log("Storage format not supported! Using default.", level=logging.ERROR) s_format = "pickle" for key, prediction_vectors in self.data.iteritems(): # Construct result directory result_path = result_dir + os.sep + "data" \ + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Store data depending on the desired format if s_format == "pickle": result_file = open(os.path.join(result_path, name + key_str + ".pickle"), "w") cPickle.dump(prediction_vectors, result_file, cPickle.HIGHEST_PROTOCOL) elif s_format == "csv": # Write as Comma Separated Value result_file = open(os.path.join(result_path, name + key_str + ".csv"),"w") if self.meta_data["num_predictors"] == 1: result_file.write("Predicted Label, Prediction Score, True Label \n") for pv in prediction_vectors: result_file.write("%s, %s, %s\n" % (pv[0].label[0], pv[0].prediction[0], pv[1])) else: # we begin by dealing with the header of the csv file base_header = "Predicted %(index)d Label, Prediction %(index)d Score, " base_result = "%(label)s, %(score)s," header = "" for i in range(self.meta_data["num_predictors"]): header+= base_header % dict(index=i+1) header += "True Label\n" result_file.write(header) # and now we can write each of the prediction vectors in turn for pv in prediction_vectors: result = "" for i in range(self.meta_data["num_predictors"]): result += base_result % dict(label=pv[0].label[i], score=pv[0].prediction[i]) result += str(pv[1]) + "\n" result_file.write(result) #Store meta data BaseDataset.store_meta_data(result_dir,self.meta_data)
def store(self, result_dir, s_format = "bp_eeg"): self.merged = False scale = 10.0 # is used to scale up the eeg sample values. The data samples are converted to int16 # when saving, so scaling is necessary to keep maintain the resolutions. # Keep original file name, depends on the AnalyserSinkNode, see it's documentation. if self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] is not None: name = self.meta_data['eeg_src_file_name'] # or use default name from this collection else: name = "Analyzer" if not s_format == "bp_eeg": self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL) return # Update the meta data author = get_author() self.update_meta_data({"type": "only output of individual nodes stored", "storage_format": s_format, "author" : author, "data_pattern": "Multiplexed"}) # Store meta data BaseDataset.store_meta_data(result_dir,self.meta_data) #self._log("EEG data file %s" % self.collection.data_file) slices = [] slices.append(0) channel_names = [] for key, time_series in self.data.iteritems(): # Sort the Times-Series Array def cmp_start(a, b): return cmp(a[0].start_time, b[0].start_time) time_series.sort(cmp_start) # Check for overlapping Windows and remove them if existent i = 0 while i < len(time_series): ts = time_series[i] #print ts[0].start_time, ts[0].end_time #print len(time_series) if ts[0].start_time >= slices[-1]: slices.append(ts[0].end_time) else: warnings.warn("Ignoring at least one overlapping window!", UserWarning) i = i+1 # STORE ACTUAL EEG DATA AND WRITE MARKERFILE result_path = result_dir + os.sep + "data_analyzer" \ + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Keep original name if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None): result_file_eeg = open(os.path.join(result_path, name + ".eeg"), "wb") result_file_mrk = open(os.path.join(result_path, name + ".vmrk"), "w") # or use default name from this collection else: result_file_eeg = open(os.path.join(result_path, name + key_str + ".eeg"), "wb") result_file_mrk = open(os.path.join(result_path, name + key_str + ".vmrk"), "w") # Write Marker header if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None): result_file_mrk.write(header_mrk % (name)) else: result_file_mrk.write(header_mrk % (name + key_str)) result_file_ms = 0 # Data for padding padding = None count_mrk = 2 num_ch = 0 sampling_int = 0 for ts in time_series: ts0 = ts[0] * scale ts0 = ts0.astype(numpy.int16) if padding == None: padding = numpy.zeros(len(ts[0].channel_names), dtype='int16') num_ch = len(ts[0].channel_names) channel_names = ts[0].channel_names sampling_int = 1000000/ts[0].sampling_frequency #print "writing %d channels.." 
% len(ts[0].channel_names) # Write Padding (zeros) while result_file_ms < ts[0].start_time - sampling_int/1000.0: result_file_eeg.write(padding.tostring()) result_file_ms += ts[0]._samples_to_ms(1) # Write window ts0.tofile(result_file_eeg) result_file_ms += ts[0].end_time - (ts[0].start_time - sampling_int/1000.0) # Write Marker markers = [] if(len(ts[0].marker_name) > 0): mk_keys = ts[0].marker_name.keys() mk_values = ts[0].marker_name.values() for mk in range(len(mk_keys)): for mv in range(len(mk_values[mk])): markers.append((mk_keys[mk], mk_values[mk][mv])) markers = sorted(markers, key=lambda tup: tup[1]) for i in range(len(markers)): if 'R' in markers[i][0]: event_type = 'Response' elif 'S' in markers[i][0]: event_type = 'Stimulus' else: event_type = 'Label' result_file_mrk.write("Mk%d=%s,%s,%d,1,0\n" % (count_mrk, event_type, markers[i][0], (ts[0].start_time + markers[i][1])*ts[0].sampling_frequency/1000.0)) count_mrk += 1 # WRITE HEADERFILE # Keep original name if (self.meta_data.has_key('eeg_src_file_name') and self.meta_data['eeg_src_file_name'] != None): result_file_hdr = open(os.path.join(result_path, name + ".vhdr"), "w") result_file_hdr.write(header_hdr % ((name), (name), num_ch, sampling_int)) # or use default name from this collection else: result_file_hdr = open(os.path.join(result_path, name + key_str + ".vhdr"), "w") result_file_hdr.write(header_hdr % ((name + key_str), (name + key_str), num_ch, sampling_int)) # Format: Ch1=Fp1,,0.1,\xB5V for i in range(num_ch): result_file_hdr.write("Ch%d=%s,,%.2f,\xB5V\n" % (i+1,channel_names[i], 1./scale)) result_file_hdr.close() result_file_eeg.close() result_file_mrk.close()
def __call__(self): """ Executes this process on the respective modality """ ############## Prepare benchmarking ############## super(ShuffleProcess, self).pre_benchmarking() for dataset_dir1 in self.input_datasets: for dataset_dir2 in self.input_datasets: dataset_name1 = dataset_dir1.split(os.sep)[-2] dataset_name2 = dataset_dir2.split(os.sep)[-2] # Check if the input data is split splitted = len( glob.glob(os.sep.join([dataset_dir1, "data_run0", "*" ]))) > 1 # Check that all constraints are fulfilled for this pair of # input datasets if not all( eval( constraint_template % { 'dataset_name1': dataset_name1, 'dataset_name2': dataset_name2 }) for constraint_template in self.dataset_constraints): continue if dataset_name1 == dataset_name2: if splitted: # Copy the data os.symlink( dataset_dir1, os.sep.join([self.result_directory, dataset_name1])) continue # Determine names of the original data sets the input # datasets are based on base_dataset1 = dataset_name1.strip("}{").split("}{")[0] base_dataset2 = dataset_name2.strip("}{").split("}{")[0] # Determine target dataset name and create directory # for it mixed_base_dataset = "%s_vs_%s" % (base_dataset1, base_dataset2) target_dataset_name = dataset_name1.replace( base_dataset1, mixed_base_dataset) target_dataset_dir = os.sep.join( [self.result_directory, target_dataset_name]) create_directory(os.sep.join([target_dataset_dir, "data_run0"])) if splitted: # For each split, copy the train data from dataset 1 and # the test data from dataset 2 to the target dataset for source_train_file_name in glob.glob( os.sep.join( [dataset_dir1, "data_run0", "*_sp*_train.*"])): # TODO: We have $n$ train sets and $n$ test sets, we "metadata.yaml"])), # could use all $n*n$ combinations target_train_file_name = source_train_file_name.replace( dataset_dir1, target_dataset_dir) if source_train_file_name.endswith("arff"): self._copy_arff_file(source_train_file_name, target_train_file_name, base_dataset1, mixed_base_dataset) else: os.symlink(source_train_file_name, target_train_file_name) source_test_file_name = source_train_file_name.replace( dataset_dir1, dataset_dir2) source_test_file_name = source_test_file_name.replace( "train.", "test.") target_test_file_name = target_train_file_name.replace( "train.", "test.") if source_train_file_name.endswith("arff"): self._copy_arff_file(source_test_file_name, target_test_file_name, base_dataset2, mixed_base_dataset) else: os.symlink(source_test_file_name, target_test_file_name) else: # Use the data set from dataset 1 as training set and # the data set from dataset 2 as test data for source_train_file_name in glob.glob( os.sep.join( [dataset_dir1, "data_run0", "*_sp*_test.*"])): target_train_file_name = source_train_file_name.replace( "test.", "train.") target_train_file_name = target_train_file_name.replace( dataset_dir1, target_dataset_dir) if source_train_file_name.endswith("arff"): self._copy_arff_file(source_train_file_name, target_train_file_name, base_dataset1, mixed_base_dataset) else: os.symlink(source_train_file_name, target_train_file_name) source_test_file_name = source_train_file_name.replace( dataset_dir1, dataset_dir2) target_test_file_name = target_train_file_name.replace( "train.", "test.") if source_train_file_name.endswith("arff"): self._copy_arff_file(source_test_file_name, target_test_file_name, base_dataset2, mixed_base_dataset) else: os.symlink(source_test_file_name, target_test_file_name) # Write metadata.yaml based on input meta data input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1) 
output_dataset_meta = dict(input_dataset1_meta) output_dataset_meta['train_test'] = True output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S") output_dataset_meta['author'] = get_author() BaseDataset.store_meta_data(target_dataset_dir, output_dataset_meta) ############## Clean up after benchmarking ############## super(ShuffleProcess, self).post_benchmarking()
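# Hedged standalone sketch: shows the pure string manipulation the shuffle
# process above uses to pair files, replacing the dataset directory and
# swapping the "train."/"test." infix.  All paths are invented.
dataset_dir1 = "/storage/shuffle/{setA}"
dataset_dir2 = "/storage/shuffle/{setB}"
target_dataset_dir = "/storage/result/{setA_vs_setB}"

source_train = dataset_dir1 + "/data_run0/features_sp0_train.pickle"
target_train = source_train.replace(dataset_dir1, target_dataset_dir)
source_test = source_train.replace(dataset_dir1, dataset_dir2).replace("train.", "test.")
target_test = target_train.replace("train.", "test.")

print(target_train)   # /storage/result/{setA_vs_setB}/data_run0/features_sp0_train.pickle
print(source_test)    # /storage/shuffle/{setB}/data_run0/features_sp0_test.pickle
print(target_test)    # /storage/result/{setA_vs_setB}/data_run0/features_sp0_test.pickle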
def store(self, result_dir, s_format = ["pickle", "real"]): """ Stores this collection in the directory *result_dir*. In contrast to *dump* this method stores the collection not in a single file but as a whole directory structure with meta information etc. The data sets are stored separately for each run, split, train/test combination. The method expects the following parameters: * *result_dir* The directory in which the collection will be stored * *name* The prefix of the file names in which the individual \ data sets are stored. The actual file names are determined \ by appending suffixes that encode run, split, train/test \ information. Defaults to "features". * *format* A list with information about the format in which the actual data sets should be stored. The first entry specifies the file format. If it is "arff" the second entry specifies the attribute format. Examples: ["arff", "real"], ["arff", "{0,1}"] .. todo:: Someone could implement the format ["fasta"] for sax features To store the data in comma separated values, use ["csv", "real"]. (*optional, default: ["pickle", "real"]*) .. todo:: Adapt storing of csv file to external library instead of doing it manually. """ name = "features" # Update the meta data author = get_author() self.update_meta_data({"type": "feature_vector", "storage_format": s_format, "author": author, "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format[0]}) if type(s_format) == list: s_type = s_format[1] s_format = s_format[0] else: s_type = "real" if not s_format in ["csv", "arff", "pickle"]: self._log("Storage format not supported! Using default.", level=logging.ERROR) s_format = "pickle" # Iterate through splits and runs in this dataset for key, feature_vectors in self.data.iteritems(): # test if dataset has already been loaded. # Otherwise replace with iterator to loaded version. 
if isinstance(feature_vectors, basestring): feature_vectors = self.get_data(key[0], key[1], key[2]) # Construct result directory result_path = result_dir + os.sep + "data" \ + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Store data depending on the desired format if s_format == "pickle": result_file = open(os.path.join(result_path, name + key_str + ".pickle"), "w") cPickle.dump(feature_vectors, result_file, cPickle.HIGHEST_PROTOCOL) elif s_format == "arff": # Write as ARFF result_file = open(os.path.join(result_path, name + key_str + ".arff"),"w") # Create the arff file header relation_name = result_dir.split(os.sep)[-1] result_file.write('@relation "%s"\n' % relation_name) # Write the type of all features for feature_name in self.meta_data["feature_names"]: result_file.write("@attribute %s %s\n" % (feature_name, s_type)) classString = "" + ",".join(sorted(self.meta_data["classes_names"])) + "" result_file.write("@attribute class {%s}\n" % classString) result_file.write("@data\n") # Write all given training data into the ARFF file fv = feature_vectors[0][0] if numpy.issubdtype(fv.dtype, numpy.string_): feature_format = "%s," elif numpy.issubdtype(fv.dtype, numpy.floating): feature_format = "%f," elif numpy.issubdtype(fv.dtype, numpy.integer): feature_format = "%d," for features, class_name in feature_vectors: for feature in features[0]: result_file.write(feature_format % feature) result_file.write("%s\n" % str(class_name)) elif s_format == "csv": # Write as Comma Separated Value result_file = open(os.path.join(result_path, name + key_str + ".csv"),"w") for feature_name in self.meta_data["feature_names"]: result_file.write("%s," % (feature_name)) result_file.write("\n") fv = feature_vectors[0][0] if numpy.issubdtype(fv.dtype, numpy.floating): feature_format = "%f," elif numpy.issubdtype(fv.dtype, numpy.integer): feature_format = "%d," else: feature_format = "%s," for features, class_name in feature_vectors: f = features.view(numpy.ndarray) for feature in f[0]: result_file.write(feature_format % feature) result_file.write("%s\n" % str(class_name)) result_file.close() #Store meta data BaseDataset.store_meta_data(result_dir,self.meta_data)
def _merge_files(self, target_collection_path, source_collection_pathes, train_set_name_suffix, target_collection_params): """ Merge all collections in source_collection_pathes and store them \ in the target collection **Parameters** :target_collection_path: Path of the dataset, in which the data of all other datasets is assembled. :source_collection_pathes: Paths of the datasets to be merged. :train_set_name_suffix: Either 'train' or 'test'. Specifies if datasets are merged for training or testing. :target_collection_params: Dictionary with all the parameters of the target dataset. """ # load a first collection, in which the data of all other collections # is assembled target_collection = BaseDataset.load(source_collection_pathes[0]) author = get_author() date = time.strftime("%Y%m%d_%H_%M_%S") # Delete node_chain file name try: target_collection.meta_data.pop("node_chain_file_name") except: pass # Update meta data and store it k = "test" if self.reverse else "train" target_collection_params["__INPUT_DATASET__"][k] = \ [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes] target_collection_params[ "__RESULT_DIRECTORY__"] = self.result_directory target_collection.meta_data.update({ "author": author, "date": date, "dataset_directory": target_collection_path, "train_test": True, "parameter_setting": target_collection_params, "input_dataset_name": source_collection_pathes[0][len(pySPACE.configuration.storage):] }) # merge data of all other collections to target collection for source_collection_path in source_collection_pathes[1:]: source_collection = BaseDataset.load(source_collection_path) for run in source_collection.get_run_numbers(): for split in source_collection.get_split_numbers(): target_data = target_collection.get_data( run, split, train_set_name_suffix) if self.set_flag: for ts, l in target_data: if ts.specs == None: ts.specs = {"new_set": False} elif ts.specs.has_key("new_set"): break else: ts.specs["new_set"] = False data = source_collection.get_data(run, split, train_set_name_suffix) if self.set_flag: for i, (ts, l) in enumerate(data): # flag first element of the concatenated data list if ts.specs == None: ts.specs = {"new_set": i == 0} else: ts.specs["new_set"] = (i == 0) # actual data is stored in a list that has to be extended target_data.extend(data) # if only test data was given, the "Rest_vs" collection is stored as # training data if not self.reverse and "test" == train_set_name_suffix: # exchange the "test" in key tuple to "train" before storing for key in target_collection.data.keys(): assert ("test" == key[2]) value = target_collection.data.pop(key) key = (key[0], key[1], "train") target_collection.data[key] = value # we store the data in the same format as before target_collection.store(target_collection_path, target_collection.meta_data["storage_format"])
def consolidate(self): """ Consolidates the results obtained by the single processes into a consistent structure of collections that are stored on the file system. """ # Consolidate the results directory_pattern = os.sep.join([self.result_directory, "{*"]) dataset_pathes = glob.glob(directory_pattern) # For all collections found for dataset_path in dataset_pathes: # Load their meta_data meta_data = BaseDataset.load_meta_data(dataset_path) # Determine author and date author = get_author() date = time.strftime("%Y%m%d_%H_%M_%S") # Update meta data and store it meta_data.update({"author": author, "date": date}) BaseDataset.store_meta_data(dataset_path, meta_data) # Copy the input dataset specification file to the result # directory in order to make later analysis of # the results more easy input_meta_path = os.sep.join([pySPACE.configuration.storage, meta_data["input_collection_name"]]) input_meta = BaseDataset.load_meta_data(input_meta_path) BaseDataset.store_meta_data(dataset_path, input_meta, file_name="input_metadata.yaml") # Check if some results consist of several runs # and update the meta data in this case # TODO: This is not a clean solution for dataset_dir in glob.glob(os.sep.join([self.result_directory, "*"])): if not os.path.isdir(dataset_dir): continue # There can be either run dirs, persistency dirs, or both of them. # Check of whichever there are more. If both exist, their numbers # are supposed to be equal. nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir, "data_run*"]))) nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir, "persistency_run*"]))) nr_runs = max(nr_run_dirs, nr_per_dirs) if nr_runs > 1: collection_meta = BaseDataset.load_meta_data(dataset_dir) collection_meta["runs"] = nr_runs BaseDataset.store_meta_data(dataset_dir, collection_meta) # If we don't create a feature vector or time series collection, # we evaluated our classification using a classification performance sink. # The resulting files should be merged to one csv tabular. pathlist = glob.glob(os.path.join(self.result_directory, "results_*")) if len(pathlist) > 0: # Do the consolidation the same way as for WekaClassificationOperation self._log("Consolidating results ...") # We load and store the results once into a PerformanceResultSummary # This does the necessary consolidation... self._log("Reading intermediate results...") result_collection = PerformanceResultSummary(dataset_dir=self.result_directory) self._log("done") self._log("Storing result collection") result_collection.store(self.result_directory) self._log("done") PerformanceResultSummary.merge_traces(self.result_directory) if self.compression: # Since we get one result summary, # we don't need the numerous folders. # So we zip them to make the whole folder more easy visible. import zipfile cwd = os.getcwd() os.chdir(self.result_directory) # If there are to many or to large folders, problems may occur. # This case we want to log, try 64 bit mode, # and then skip the zipping. 
try: pathlist = glob.glob(os.path.join(self.result_directory, "{*}")) if not self.compression == "delete": save_file = zipfile.ZipFile( self.result_directory + "/result_folders.zip", mode="w", compression=self.compression ) # we want to have the zipped file relative to the # result directory for path in pathlist: for node in os.walk(path): rel_path = os.path.relpath(node[0], self.result_directory) save_file.write(rel_path) for data in node[2]: save_file.write(os.path.join(rel_path, data)) save_file.close() # To still have an easy access to the history of the # processing, we keep one folder. pathlist.pop() for path in pathlist: shutil.rmtree(path) except Exception, e: self._log( "Result files could not be compressed with 32" + " bit mode, switching to 64 bit mode", level=logging.CRITICAL, ) # nearly total code copy, only difference with 64 bit mode try: pathlist = glob.glob(os.path.join(self.result_directory, "{*}")) save_file = zipfile.ZipFile( self.result_directory + "/result_folders.zip", mode="w", compression=self.compression, allowZip64=True, ) # we want to have the zipped file relative to the # result directory for path in pathlist: for node in os.walk(path): rel_path = os.path.relpath(node[0], self.result_directory) save_file.write(rel_path) for data in node[2]: save_file.write(os.path.join(rel_path, data)) save_file.close() # To still have an easy access to the history of the # processing, we keep one folder. pathlist.pop() for path in pathlist: shutil.rmtree(path) except: self._log( "64 bit mode also failed. Please check your files and your code or contact your local programmer!", level=logging.CRITICAL, ) os.chdir(cwd)
def store(self, result_dir, s_format="pickle"): """ Stores this collection in the directory *result_dir*. In contrast to *dump* this method stores the collection not in a single file but as a whole directory structure with meta information etc. The data sets are stored separately for each run, split, train/test combination. **Parameters** :result_dir: The directory in which the collection will be stored. :name: The prefix of the file names in which the individual data sets are stored. The actual file names are determined by appending suffixes that encode run, split, train/test information. (*optional, default: "time_series"*) :format: The format in which the actual data sets should be stored. Possible formats are *pickle*, *text*, *csv* and *MATLAB* (.mat) format. In the MATLAB and text format, all time series objects are concatenated to a single large table containing only integer values. For the csv format comma separated values are taken as default or a specified Python format string. The MATLAB format is a struct that contains the data, the sampling frequency and the channel names. .. note:: For the text and MATLAB format, markers could be added by using a Marker_To_Mux node before (*optional, default: "pickle"*) .. todo:: Put marker to the right time point and also write marker channel. """ name = "time_series" if type(s_format) == list: s_type = s_format[1] s_format = s_format[0] else: s_type = "%.18e" if s_format in ["text", "matlab"]: s_type = "%i" if s_format == "csv" and s_type == "real": s_type = "%.18e" # Update the meta data author = get_author() self.update_meta_data({ "type": "time_series", "storage_format": s_format, "author": author, "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format }) # Iterate through splits and runs in this dataset for key, time_series in self.data.iteritems(): # load data, if necessary # (due to the lazy loading, the data might be not loaded already) if isinstance(time_series, basestring): time_series = self.get_data(key[0], key[1], key[2]) if self.sort_string is not None: time_series.sort(key=eval(self.sort_string)) # Construct result directory result_path = result_dir + os.sep + "data" + "_run%s" % key[0] if not os.path.exists(result_path): os.mkdir(result_path) key_str = "_sp%s_%s" % key[1:] # Store data depending on the desired format if s_format in ["pickle", "cpickle", "cPickle"]: result_file = open( os.path.join(result_path, name + key_str + ".pickle"), "w") cPickle.dump(time_series, result_file, cPickle.HIGHEST_PROTOCOL) elif s_format in ["text", "csv"]: self.update_meta_data({ "type": "stream", "marker_column": "marker" }) result_file = open( os.path.join(result_path, name + key_str + ".csv"), "w") csvwriter = csv.writer(result_file) channel_names = copy.deepcopy(time_series[0][0].channel_names) if s_format == "csv": channel_names.append("marker") csvwriter.writerow(channel_names) for (data, key) in time_series: if s_format == "text": numpy.savetxt(result_file, data, delimiter=",", fmt=s_type) if not key is None: result_file.write(str(key)) result_file.flush() elif data.marker_name is not None \ and len(data.marker_name) > 0: result_file.write(str(data.marker_name)) result_file.flush() else: first_line = True marker = "" if not key is None: marker = str(key) elif data.marker_name is not None \ and len(data.marker_name) > 0: marker = str(data.marker_name) for line in data: l = list(line) l.append(marker) csvwriter.writerow(list(l)) if first_line: first_line = False marker = "" result_file.flush() elif s_format in ["mat"]: 
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".mat"), "w")
                # extract a first time series object to get meta data
                merged_time_series = time_series.pop(0)[0]
                # collect all important information in the collection_object
                collection_object = {
                    "sampling_frequency":
                        merged_time_series.sampling_frequency,
                    "channel_names": merged_time_series.channel_names}
                # merge all data
                for (data, key) in time_series:
                    merged_time_series = numpy.vstack((merged_time_series,
                                                       data))
                collection_object["data"] = merged_time_series
                mdict = dict()
                mdict[name + key_str] = collection_object
                import scipy.io
                scipy.io.savemat(result_file, mdict=mdict)
            elif s_format in ["eeg"]:
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".eeg"),
                                   "a+")
                result_file_mrk = open(os.path.join(result_path,
                                                    name + key_str + ".vmrk"),
                                       "w")
                result_file_mrk.write("Brain Vision Data Exchange Marker "
                                      "File, Version 1.0\n")
                result_file_mrk.write("; Data stored by pySPACE\n")
                result_file_mrk.write("[Common Infos]\n")
                result_file_mrk.write("Codepage=UTF-8\n")
                result_file_mrk.write("DataFile=%s\n" %
                                      str(name + key_str + ".eeg"))
                result_file_mrk.write("\n[Marker Infos]\n")

                markerno = 1
                datapoint = 1
                sf = None
                channel_names = None

                for t in time_series:
                    if sf is None:
                        sf = t[0].sampling_frequency
                    if channel_names is None:
                        channel_names = t[0].get_channel_names()
                    for mrk in t[0].marker_name.keys():
                        for tm in t[0].marker_name[mrk]:
                            result_file_mrk.write(
                                str("Mk%d=Stimulus,%s,%d,1,0\n" %
                                    (markerno, mrk,
                                     datapoint + (tm * sf / 1000.0))))
                            markerno += 1
                    data_ = t[0].astype(numpy.int16)
                    data_.tofile(result_file)
                    datapoint += data_.shape[0]

                result_hdr = open(os.path.join(result_path,
                                               name + key_str + ".vhdr"), "w")
                result_hdr.write("Brain Vision Data Exchange Header "
                                 "File Version 1.0\n")
                result_hdr.write("; Data stored by pySPACE\n\n")
                result_hdr.write("[Common Infos]\n")
                result_hdr.write("Codepage=UTF-8\n")
                result_hdr.write("DataFile=%s\n" %
                                 str(name + key_str + ".eeg"))
                result_hdr.write("MarkerFile=%s\n" %
                                 str(name + key_str + ".vmrk"))
                result_hdr.write("DataFormat=BINARY\n")
                result_hdr.write("DataOrientation=MULTIPLEXED\n")
                result_hdr.write("NumberOfChannels=%d\n" % len(channel_names))
                result_hdr.write("SamplingInterval=%d\n\n" % (1000000 / sf))
                result_hdr.write("[Binary Infos]\n")
                result_hdr.write("BinaryFormat=INT_16\n\n")
                result_hdr.write("[Channel Infos]\n")

                # TODO: Add Resolutions to time_series
                # 0 = 0.1 [micro]V,
                # 1 = 0.5 [micro]V,
                # 2 = 10 [micro]V,
                # 3 = 152.6 [micro]V (seems to be unused!)
                resolutions_str = [
                    unicode("0.1,%sV" % unicode(u"\u03BC")),
                    unicode("0.5,%sV" % unicode(u"\u03BC")),
                    unicode("10,%sV" % unicode(u"\u03BC")),
                    unicode("152.6,%sV" % unicode(u"\u03BC"))]
                for i in range(len(channel_names)):
                    result_hdr.write(
                        unicode("Ch%d=%s,,%s\n" %
                                (i + 1, channel_names[i],
                                 unicode(resolutions_str[0]))).encode('utf-8'))
            else:
                raise NotImplementedError("Using unavailable storage "
                                          "format: %s!" % s_format)
            result_file.close()
            self.update_meta_data({
                "channel_names":
                    copy.deepcopy(time_series[0][0].channel_names),
                "sampling_frequency": time_series[0][0].sampling_frequency})
        # Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
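# ---------------------------------------------------------------------------
# Hedged sketch (not part of pySPACE): with the default ``s_format="pickle"``
# the method above writes one file per (run, split, train/test) key, following
# the pattern <result_dir>/data_run<run>/time_series_sp<split>_<tt>.pickle.
# The reader below only illustrates that layout; the function name
# ``load_pickled_time_series`` is an assumption, not pySPACE API.
# ---------------------------------------------------------------------------
import cPickle
import glob
import os


def load_pickled_time_series(result_dir):
    """ Return {(run, split, 'train'/'test'): unpickled data}. """
    data = {}
    pattern = os.path.join(result_dir, "data_run*",
                           "time_series_sp*_*.pickle")
    for file_name in glob.glob(pattern):
        # run number is encoded in the directory name, e.g. "data_run0"
        run_dir = os.path.basename(os.path.dirname(file_name))
        run = int(run_dir[len("data_run"):])
        # split number and train/test suffix are encoded in the file name,
        # e.g. "time_series_sp0_test.pickle"
        base = os.path.basename(file_name)[:-len(".pickle")]
        split_part, tt_part = base.split("_")[-2:]
        split = int(split_part[len("sp"):])
        with open(file_name, "rb") as result_file:
            data[(run, split, tt_part)] = cPickle.load(result_file)
    return data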
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(ShuffleProcess, self).pre_benchmarking()

        for dataset_dir1 in self.input_datasets:
            for dataset_dir2 in self.input_datasets:
                dataset_name1 = dataset_dir1.split(os.sep)[-2]
                dataset_name2 = dataset_dir2.split(os.sep)[-2]

                # Check if the input data is split
                splitted = len(glob.glob(os.sep.join([dataset_dir1,
                                                      "data_run0",
                                                      "*"]))) > 1

                # Check that all constraints are fulfilled for this pair of
                # input datasets
                if not all(eval(constraint_template %
                                {'dataset_name1': dataset_name1,
                                 'dataset_name2': dataset_name2})
                           for constraint_template
                           in self.dataset_constraints):
                    continue

                if dataset_name1 == dataset_name2:
                    if splitted:
                        # Copy the data
                        os.symlink(dataset_dir1,
                                   os.sep.join([self.result_directory,
                                                dataset_name1]))
                    continue

                # Determine names of the original data sets the input
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]

                # Determine target dataset name and create directory for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1,
                                                   base_dataset2)
                target_dataset_name = dataset_name1.replace(
                    base_dataset1, mixed_base_dataset)

                target_dataset_dir = os.sep.join([self.result_directory,
                                                  target_dataset_name])

                create_directory(os.sep.join([target_dataset_dir,
                                              "data_run0"]))

                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(
                            os.sep.join([dataset_dir1, "data_run0",
                                         "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets, we
                        #       could use all $n*n$ combinations
                        target_train_file_name = \
                            source_train_file_name.replace(
                                dataset_dir1, target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = \
                            source_train_file_name.replace(dataset_dir1,
                                                           dataset_dir2)
                        source_test_file_name = \
                            source_test_file_name.replace("train.", "test.")
                        target_test_file_name = \
                            target_train_file_name.replace("train.", "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(
                            os.sep.join([dataset_dir1, "data_run0",
                                         "*_sp*_test.*"])):
                        target_train_file_name = \
                            source_train_file_name.replace("test.", "train.")
                        target_train_file_name = \
                            target_train_file_name.replace(
                                dataset_dir1, target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = \
                            source_train_file_name.replace(dataset_dir1,
                                                           dataset_dir2)
                        target_test_file_name = \
                            target_train_file_name.replace("train.", "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)

                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)
                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                output_dataset_meta['author'] = get_author()
                BaseDataset.store_meta_data(target_dataset_dir,
                                            output_dataset_meta)

        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()
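# ---------------------------------------------------------------------------
# Hedged sketch (not part of pySPACE): the process above pairs every ordered
# combination of two different input datasets, takes the training files from
# the first and the test files from the second, and links them into a new
# "<base1>_vs_<base2>" dataset. The helper below replays that pairing on plain
# directories of "*_train.*" / "*_test.*" files; its name ``shuffle_pair`` and
# the omitted arff special case are simplifying assumptions.
# ---------------------------------------------------------------------------
import glob
import os


def shuffle_pair(dataset_dir1, dataset_dir2, result_directory):
    """ Link train data of dataset 1 with test data of dataset 2. """
    name1 = os.path.basename(dataset_dir1.rstrip(os.sep))
    name2 = os.path.basename(dataset_dir2.rstrip(os.sep))
    target_dir = os.path.join(result_directory,
                              "%s_vs_%s" % (name1, name2), "data_run0")
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    train_files = glob.glob(os.path.join(dataset_dir1, "data_run0",
                                         "*_sp*_train.*"))
    for train_file in train_files:
        # the train part is taken unchanged from dataset 1
        os.symlink(train_file,
                   os.path.join(target_dir, os.path.basename(train_file)))
        # the matching test part comes from dataset 2
        test_name = os.path.basename(train_file).replace("train.", "test.")
        test_file = os.path.join(dataset_dir2, "data_run0", test_name)
        os.symlink(test_file, os.path.join(target_dir, test_name))
    return target_dir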