def add_data_block(self, partition_id, x, y):
    # Build a single data block for the given partition, appending one
    # TF Example per row of x (and the matching label from y, if provided).
    dbm = self._dbms[partition_id]
    builder = DataBlockBuilder(
        common.data_source_data_block_dir(self._data_source),
        self._data_source.data_source_meta.name,
        partition_id,
        dbm.get_dumped_data_block_count(),
        dj_pb.WriterOptions(output_writer="TF_RECORD"),
        None
    )
    builder.set_data_block_manager(dbm)
    for i in range(x.shape[0]):
        feat = {}
        exam_id = '{}'.format(i).encode()
        feat['example_id'] = Feature(bytes_list=BytesList(value=[exam_id]))
        feat['event_time'] = Feature(int64_list=Int64List(value=[i]))
        feat['x'] = Feature(float_list=FloatList(value=list(x[i])))
        if y is not None:
            feat['y'] = Feature(int64_list=Int64List(value=[y[i]]))
        example = Example(features=Features(feature=feat))
        builder.append_item(TfExampleItem(example.SerializeToString()), i, 0)
    return builder.finish_data_block()
def _create_data_block(self, data_source, partition_id, x, y):
    # Split x/y into N chunks and dump each chunk as a separate data block,
    # tracking leader/follower indexes so joined examples can be verified later.
    data_block_metas = []
    dbm = data_block_manager.DataBlockManager(data_source, partition_id)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    N = 200
    chunk_size = x.shape[0] // N
    leader_index = 0
    follower_index = N * chunk_size * 10
    for i in range(N):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(data_source),
            data_source.data_source_meta.name,
            partition_id,
            i,
            dj_pb.WriterOptions(output_writer="TF_RECORD"),
            None
        )
        builder.set_data_block_manager(dbm)
        for j in range(chunk_size):
            feat = {}
            idx = i * chunk_size + j
            exam_id = '{}'.format(idx).encode()
            feat['example_id'] = Feature(bytes_list=BytesList(value=[exam_id]))
            evt_time = random.randint(1, 1000)
            feat['event_time'] = Feature(int64_list=Int64List(value=[evt_time]))
            feat['x'] = Feature(float_list=FloatList(value=list(x[idx])))
            if y is not None:
                feat['y'] = Feature(int64_list=Int64List(value=[y[idx]]))
            feat['leader_index'] = Feature(int64_list=Int64List(value=[leader_index]))
            feat['follower_index'] = Feature(int64_list=Int64List(value=[follower_index]))
            example = Example(features=Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            leader_index += 1
            follower_index += 1
        data_block_metas.append(builder.finish_data_block())
    self.max_index = follower_index
    return data_block_metas
def _make_serialized_example(x):
    example = Example()
    example.features.feature["x"].float_list.value.append(x)
    return example.SerializeToString()
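# For reference, an example serialized by _make_serialized_example can be decoded
# again with TensorFlow's parsing ops. A minimal sketch, assuming Example above is
# tf.train.Example and TensorFlow 2.x eager mode:
import tensorflow as tf

serialized = _make_serialized_example(3.14)
# "x" is stored as a variable-length float list, so VarLenFeature yields a SparseTensor.
parsed = tf.io.parse_single_example(serialized,
                                    {"x": tf.io.VarLenFeature(tf.float32)})
print(tf.sparse.to_dense(parsed["x"]).numpy())  # -> [3.14] (as float32)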
def generate(shape, window_size=10, start=0, offset=5, noise_samples=0,
             classes=None, damaged_subjects=None):
    if classes is None:
        classes = ["left-fist", "right-fist"]
    if damaged_subjects is None:
        damaged_subjects = ["S088", "S089", "S092", "S100", "S104"]
    if type(shape) not in (tuple, list):
        shape = [shape]
    dataset_dir = os.path.join(
        PHYSIONET_DIR, "normalized-by-sample",
        _get_window_folder_name(window_size, start, offset, noise_samples),
        f"{window_size}x{'x'.join(str(dim) for dim in shape)}",
        "-".join(classes))

    # Map the requested classes to the EDF executions (runs) that contain them.
    executions = []
    if "eyes-closed" in classes:
        executions.append("R02")
    if any(c in classes for c in ["left-fist", "right-fist"]):
        executions.extend(["R04", "R08", "R12"])
    if any(c in classes for c in ["both-fists", "both-feet"]):
        executions.extend(["R06", "R10", "R14"])

    # Assign label values in a fixed class order, independent of the order in `classes`.
    labels = {}
    label_value = 0
    for class_name in ["eyes-closed", "left-fist", "right-fist", "both-fists", "both-feet"]:
        if class_name in classes:
            labels[class_name] = label_value
            label_value += 1

    fsh.recreate_dir(dataset_dir)
    regex_executions = f"({'|'.join(executions)})"
    info = {"n_samples_by_subject": 0}

    subjects = filter(lambda s: s not in damaged_subjects,
                      sorted(os.listdir(RAW_EDF_FILES_DIR)))
    for subject in filter(lambda f: re.match("S(\\d+)", f), subjects):
        print(f"Generating TFRecord file from the subject {subject} ...")
        X_segments = np.empty((0, 64))
        y = np.empty(0, dtype=np.int64)
        edf_subject_path_dir = os.path.join(RAW_EDF_FILES_DIR, subject)
        edf_file_names = sorted(os.listdir(edf_subject_path_dir))
        for edf_file_name in filter(
                lambda f: re.match(f"^{subject}{regex_executions}\\.edf$", f),
                edf_file_names):
            edf_file = EdfFile(edf_subject_path_dir, edf_file_name)
            # Keep only the first sample index of each run of identical labels (event onsets).
            events_windows = [next(group) for key, group in
                              groupby(enumerate(edf_file.labels), key=itemgetter(1))]
            n_events = len(events_windows)
            for index, (event_start_index, event) in enumerate(events_windows):
                if event == "rest":
                    continue
                event_start_index += start
                X = edf_file.data[event_start_index:] if index + 1 == n_events \
                    else edf_file.data[event_start_index:events_windows[index + 1][0]]
                n_segments = 0
                for (start_segment, end_segment) in _windows(X, window_size, offset):
                    x_segment = X[start_segment:end_segment]
                    X_segments = np.vstack((X_segments, x_segment))
                    y = np.append(y, labels[event])
                    # Optionally augment each segment with Gaussian-noise copies.
                    for _ in range(noise_samples):
                        noise = np.random.normal(0, 1, x_segment.shape)
                        X_segments = np.vstack((X_segments, x_segment + noise))
                        y = np.append(y, labels[event])
                    n_segments += 1
                print(f"X{X.shape} split into {n_segments} segments of "
                      f"{window_size} samples with offset of {offset} "
                      f"plus {noise_samples} noise samples")
            edf_file.close()
        if len(y) > info["n_samples_by_subject"]:
            info["n_samples_by_subject"] = len(y)
        print("Labels: ", len(y), len(y[y == 0]), len(y[y == 1]))
        X_segments = X_segments.reshape((-1, window_size, 64))
        print("Has nan: ", np.isnan(X_segments).any())
        tfrecord_subject_filepath = os.path.join(dataset_dir, f"{subject}.tfrecord")
        options = tf.io.TFRecordOptions(compression_type="GZIP")
        with tf.io.TFRecordWriter(tfrecord_subject_filepath, options) as writer:
            for n_segment in range(len(y)):
                X_segment = np.array(list(map(lambda x: _process_record(x, shape),
                                              X_segments[n_segment])))
                eeg_example = Example(
                    features=Features(
                        feature={
                            "X": Feature(bytes_list=BytesList(
                                value=[tf.io.serialize_tensor(X_segment).numpy()])),
                            "y": Feature(int64_list=Int64List(value=[y[n_segment]]))
                        }
                    )
                )
                writer.write(eeg_example.SerializeToString())
    print("n_samples_by_subject: ", info["n_samples_by_subject"])
    info_filepath = os.path.join(dataset_dir, "info.pkl")
    with open(info_filepath, "wb") as fp:
        pickle.dump(info, fp, protocol=4)
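# For reference, the per-subject TFRecord files written above can be read back
# with tf.data. This is a minimal sketch, assuming TensorFlow 2.x eager mode;
# "S001.tfrecord" is a hypothetical filename and float64 (NumPy's default dtype)
# is assumed for the serialized tensors.
import tensorflow as tf

feature_spec = {
    "X": tf.io.FixedLenFeature([], tf.string),
    "y": tf.io.FixedLenFeature([], tf.int64),
}

def _parse_eeg_example(serialized):
    parsed = tf.io.parse_single_example(serialized, feature_spec)
    # serialize_tensor was applied to a NumPy float array, hence tf.float64.
    x = tf.io.parse_tensor(parsed["X"], out_type=tf.float64)
    return x, parsed["y"]

dataset = tf.data.TFRecordDataset("S001.tfrecord", compression_type="GZIP") \
    .map(_parse_eeg_example)
for x, label in dataset.take(1):
    print(x.shape, label.numpy())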
def generate(dataset_root_dir, events_enum, labels_enum, labels, classes, window_size):
    dataset_dir = os.path.join(dataset_root_dir, "normalized-by-sample",
                               f"window-{window_size}", "-".join(classes).lower())
    fsh.recreate_dir(dataset_dir)
    info = {"n_samples_by_file": 0}
    subjects_labels_counts = LabelsCounts()
    subjects_session_labels_counts = LabelsCounts()
    gdf_files_dir = os.path.join(dataset_root_dir, "gdf-files")
    gdf_file_names = filter(lambda f: re.match(".*\\.gdf", f),
                            sorted(os.listdir(gdf_files_dir)))
    for gdf_file_name in gdf_file_names:
        gdf_file = mne.io.read_raw_gdf(os.path.join(gdf_files_dir, gdf_file_name),
                                       preload=True)
        groups_gdf_file = re.match("(.)(\\d{2})(\\d{0,2})([ET])\\.gdf",
                                   gdf_file_name).groups()
        database_prefix = groups_gdf_file[0]
        subject = groups_gdf_file[1]
        session = groups_gdf_file[2]
        session_type = groups_gdf_file[3]
        labels_filepath = os.path.join(
            gdf_files_dir, "labels",
            f"{database_prefix}{subject}{session}{session_type}.mat")
        labels_file = io.loadmat(labels_filepath)["classlabel"]
        annotations = gdf_file.annotations
        start_trials_indexes = [
            event_index
            for event_index in range(len(annotations.description))
            if events_enum.get(annotations.description[event_index]) == "NEW_TRIAL"
        ]
        # Indexes of the EEG channels within the GDF channel list.
        indexes_channels_eeg = [
            index for index, ch in enumerate(gdf_file.ch_names) if "EEG" in ch
        ]
        n_channels = len(indexes_channels_eeg)
        frequency = int(gdf_file.info["sfreq"])
        # Consider the complete motor imagery period (4 s),
        # not only the cue exhibition period (1.25 s).
        duration_event = window_size // frequency
        n_samples = frequency * duration_event
        rejected_trials = 0
        ignored_trials = 0
        X = np.empty((0, n_channels))
        y = np.empty(0, dtype=np.int64)
        for n_trial, event_start_trial_index in enumerate(start_trials_indexes):
            cue_event_index = event_start_trial_index + 1
            onset_event = annotations.onset[cue_event_index]
            onset_index = int(np.ceil(onset_event * frequency))
            end_index = int(np.ceil((onset_event + duration_event) * frequency))
            event_samples = end_index - onset_index
            if event_samples != n_samples:
                end_index += n_samples - event_samples
            # The cue event that follows the start-of-trial event identifies the trial.
            if events_enum[annotations.description[cue_event_index]] == "REJECTED_TRIAL":
                rejected_trials += 1
                continue
            label = labels_enum[labels_file[n_trial][0]]
            if label not in classes:
                ignored_trials += 1
                continue
            # Index 0 returns the data array of the gdf_file;
            # index 1 would return the times array.
            x = gdf_file[indexes_channels_eeg, onset_index:end_index][0].T
            # Normalize each trial by its own mean and standard deviation.
            x = (x - np.mean(x)) / np.std(x)
            X = np.vstack((X, x))
            y = np.append(y, labels[label])
        gdf_file.close()
        X = X.reshape((-1, n_samples, n_channels))
        tfrecord_filepath = os.path.join(
            dataset_dir,
            f"{database_prefix}{subject}{session}{session_type}.tfrecord")
        options = tf.io.TFRecordOptions(compression_type="GZIP")
        with tf.io.TFRecordWriter(tfrecord_filepath, options) as writer:
            for n_segment in range(len(y)):
                eeg_example = Example(features=Features(
                    feature={
                        "X": Feature(bytes_list=BytesList(value=[
                            tf.io.serialize_tensor(X[n_segment]).numpy()
                        ])),
                        "y": Feature(int64_list=Int64List(value=[y[n_segment]]))
                    }))
                writer.write(eeg_example.SerializeToString())
        valid_trials = len(y)
        if valid_trials > info["n_samples_by_file"]:
            info["n_samples_by_file"] = valid_trials
        labels_counts = np.unique(y, return_counts=True)[1]
        subjects_labels_counts.put(subject, labels_counts)
        subjects_session_labels_counts.put(subject + session_type, labels_counts)
        print("Info from file " + gdf_file_name)
        print(f"Labels counts: {labels_counts}")
        print("Valid Trials: " + str(valid_trials))
        print("Ignored Trials: " + str(ignored_trials))
        print("Rejected Trials: " + str(rejected_trials))
        print("Total Trials: " + str(valid_trials + ignored_trials + rejected_trials))
    print("Subjects Labels Counts:")
    print(subjects_labels_counts)
    print("Subjects Session Counts:")
    print(subjects_session_labels_counts)
    print("Dataset generation ended, saving info data ...")
    print("info[n_samples_by_file]=", info["n_samples_by_file"])
    info_filepath = os.path.join(dataset_dir, "info.pkl")
    with open(info_filepath, "wb") as fp:
        pickle.dump(info, fp, protocol=4)
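# For context, a call to this generate() for the BCI Competition IV 2a recordings
# might look like the sketch below. The event codes follow the competition's
# published description ("768" = start of a new trial, "769"-"772" = class cues,
# "1023" = rejected trial), but the concrete enum names, class names, path and
# window size are assumptions for illustration, not taken from this snippet.
events_enum = {
    "768": "NEW_TRIAL",
    "1023": "REJECTED_TRIAL",
    "769": "CUE_LEFT_HAND",
    "770": "CUE_RIGHT_HAND",
    "771": "CUE_FEET",
    "772": "CUE_TONGUE",
}
labels_enum = {1: "LEFT_HAND", 2: "RIGHT_HAND", 3: "FEET", 4: "TONGUE"}
classes = ["LEFT_HAND", "RIGHT_HAND"]
labels = {"LEFT_HAND": 0, "RIGHT_HAND": 1}

# window_size = sampling frequency (250 Hz) * motor imagery period (4 s)
generate("datasets/bci-iv-2a", events_enum, labels_enum, labels, classes,
         window_size=1000)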