Example #1
    def add_data_block(self, partition_id, x, y):
        dbm = self._dbms[partition_id]

        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name, partition_id,
            dbm.get_dumped_data_block_count(),
            dj_pb.WriterOptions(output_writer="TF_RECORD"), None)
        builder.set_data_block_manager(dbm)
        for i in range(x.shape[0]):
            feat = {}
            exam_id = '{}'.format(i).encode()
            feat['example_id'] = Feature(
                bytes_list=BytesList(value=[exam_id]))
            feat['event_time'] = Feature(
                int64_list=Int64List(value=[i])
            )
            feat['x'] = Feature(float_list=FloatList(value=list(x[i])))
            if y is not None:
                feat['y'] = Feature(int64_list=Int64List(value=[y[i]]))

            example = Example(features=Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()), i, 0)

        return builder.finish_data_block()
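
The builder above wraps each row in a tf.train.Example before handing it to the data block writer. A minimal round-trip sketch of that record layout (the feature dimension of 4 and the toy values are assumptions of this sketch, not taken from the code above):

import tensorflow as tf

# Build one record with the same feature layout as add_data_block, then parse it back.
feat = {
    'example_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'0'])),
    'event_time': tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
    'x': tf.train.Feature(float_list=tf.train.FloatList(value=[0.0] * 4)),
    'y': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
}
serialized = tf.train.Example(
    features=tf.train.Features(feature=feat)).SerializeToString()

spec = {
    'example_id': tf.io.FixedLenFeature([], tf.string),
    'event_time': tf.io.FixedLenFeature([], tf.int64),
    'x': tf.io.FixedLenFeature([4], tf.float32),  # 4 = assumed feature dimension
    'y': tf.io.FixedLenFeature([], tf.int64),
}
parsed = tf.io.parse_single_example(serialized, spec)  # parsed['x'] has shape (4,)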
Example #2
    def _create_data_block(self, data_source, partition_id, x, y):
        data_block_metas = []
        dbm = data_block_manager.DataBlockManager(data_source, partition_id)
        self.assertEqual(dbm.get_dumped_data_block_count(), 0)
        self.assertEqual(dbm.get_lastest_data_block_meta(), None)
        N = 200
        chunk_size = x.shape[0] // N

        leader_index = 0
        follower_index = N * chunk_size * 10
        for i in range(N):
            builder = DataBlockBuilder(
                common.data_source_data_block_dir(data_source),
                data_source.data_source_meta.name,
                partition_id, i,
                dj_pb.WriterOptions(output_writer="TF_RECORD"), None
            )
            builder.set_data_block_manager(dbm)
            for j in range(chunk_size):
                feat = {}
                idx = i * chunk_size + j
                exam_id = '{}'.format(idx).encode()
                feat['example_id'] = Feature(
                    bytes_list=BytesList(value=[exam_id]))
                evt_time = random.randint(1, 1000)
                feat['event_time'] = Feature(
                    int64_list=Int64List(value=[evt_time])
                )
                feat['x'] = Feature(float_list=FloatList(value=list(x[idx])))
                if y is not None:
                    feat['y'] = Feature(int64_list=Int64List(value=[y[idx]]))

                feat['leader_index'] = Feature(
                    int64_list=Int64List(value=[leader_index])
                )
                feat['follower_index'] = Feature(
                    int64_list=Int64List(value=[follower_index])
                )
                example = Example(features=Features(feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                leader_index += 1
                follower_index += 1
            data_block_metas.append(builder.finish_data_block())
        self.max_index = follower_index
        return data_block_metas
def _make_serialized_example(x):
    example = Example()
    example.features.feature["x"].float_list.value.append(x)
    return example.SerializeToString()
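Assuming Example here is tf.train.Example (the imports are not shown in the snippet), the record returned by _make_serialized_example can be parsed back with a one-entry feature spec; a minimal sketch:

import tensorflow as tf

serialized = _make_serialized_example(1.5)
parsed = tf.io.parse_single_example(
    serialized, {"x": tf.io.FixedLenFeature([1], tf.float32)})
# parsed["x"] is a float32 tensor of shape (1,) holding 1.5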
def generate(shape, window_size=10, start=0, offset=5, noise_samples=0,
             classes=None, damaged_subjects=None):
    if classes is None:
        classes = ["left-fist", "right-fist"]
    if damaged_subjects is None:
        damaged_subjects = ["S088", "S089", "S092", "S100", "S104"]
    if type(shape) not in (tuple, list):
        shape = [shape]

    dataset_dir = os.path.join(PHYSIONET_DIR, "normalized-by-sample",
                               _get_window_folder_name(window_size, start, offset, noise_samples),
                               f"{window_size}x{'x'.join(str(dim) for dim in shape)}",
                               "-".join(classes))
    executions = []
    if "eyes-closed" in classes:
        executions.append("R02")
    if any(c in classes for c in ("left-fist", "right-fist")):
        executions.extend(["R04", "R08", "R12"])
    if any(c in classes for c in ("both-fists", "both-feet")):
        executions.extend(["R06", "R10", "R14"])

    label_value = 0
    labels = {}
    for class_name in ("eyes-closed", "left-fist", "right-fist",
                       "both-fists", "both-feet"):
        if class_name in classes:
            labels[class_name] = label_value
            label_value += 1

    fsh.recreate_dir(dataset_dir)

    regex_executions = f"({'|'.join(executions)})"
    info = {
        "n_samples_by_subject": 0
    }
    subjects = filter(lambda s: s not in damaged_subjects, sorted(os.listdir(RAW_EDF_FILES_DIR)))
    for subject in filter(lambda f: re.match("S(\\d+)", f), subjects):
        print(f"Generating TFRecord file from the subject {subject} ...")

        X_segments = np.empty((0, 64))
        y = np.empty(0, dtype=np.int64)

        edf_subject_path_dir = os.path.join(RAW_EDF_FILES_DIR, subject)
        edf_file_names = sorted(os.listdir(edf_subject_path_dir))
        for edf_file_name in filter(lambda f: re.match(f"^{subject}{regex_executions}\\.edf$", f),
                                    edf_file_names):
            edf_file = EdfFile(edf_subject_path_dir, edf_file_name)
            events_windows = [next(group) for key, group in groupby(enumerate(edf_file.labels), key=itemgetter(1))]
            n_events = len(events_windows)
            for index, (event_start_index, event) in enumerate(events_windows):
                if event == "rest":
                    continue

                event_start_index += start
                X = edf_file.data[event_start_index:] if index + 1 == n_events \
                    else edf_file.data[event_start_index:events_windows[index + 1][0]]
                n_segments = 0
                for (start_segment, end_segment) in _windows(X, window_size, offset):
                    x_segment = X[start_segment:end_segment]
                    X_segments = np.vstack((X_segments, x_segment))
                    y = np.append(y, labels[event])
                    for _ in range(noise_samples):
                        noise = np.random.normal(0, 1, x_segment.shape)
                        X_segments = np.vstack((X_segments, x_segment + noise))
                        y = np.append(y, labels[event])
                    n_segments += 1
                print(f"X{X.shape} splitted into {n_segments} segments of "
                      f"{window_size} samples with offset of {offset} plus {noise_samples} noise samples")

            edf_file.close()

        if len(y) > info["n_samples_by_subject"]:
            info["n_samples_by_subject"] = len(y)

        print("Labels: ", len(y), len(y[y == 0]), len(y[y == 1]))

        X_segments = X_segments.reshape((-1, window_size, 64))
        print("Has nan: ", np.isnan(X_segments).any())

        tfrecord_subject_filepath = os.path.join(dataset_dir, f"{subject}.tfrecord")
        options = tf.io.TFRecordOptions(compression_type="GZIP")
        with tf.io.TFRecordWriter(tfrecord_subject_filepath, options) as writer:
            for n_segment in range(len(y)):
                X_segment = np.array(list(map(lambda x: _process_record(x, shape), X_segments[n_segment])))

                eeg_example = Example(
                    features=Features(
                        feature={
                            "X": Feature(bytes_list=BytesList(value=[tf.io.serialize_tensor(X_segment).numpy()])),
                            "y": Feature(int64_list=Int64List(value=[y[n_segment]]))
                        }
                    )
                )
                writer.write(eeg_example.SerializeToString())

    print("n_samples_by_subject: ", info["n_samples_by_subject"])
    info_filepath = os.path.join(dataset_dir, "info.pkl")
    with open(info_filepath, "wb") as fp:
        pickle.dump(info, fp, protocol=4)
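The GZIP-compressed TFRecord files written above can be read back with tf.data. A minimal sketch, assuming the serialized segments are float64 (the dtype of X_segments; whether _process_record changes it is not shown here) and using a hypothetical file path:

import tensorflow as tf

feature_spec = {
    "X": tf.io.FixedLenFeature([], tf.string),
    "y": tf.io.FixedLenFeature([], tf.int64),
}

def _parse_eeg_example(record):
    parsed = tf.io.parse_single_example(record, feature_spec)
    # out_type must match the dtype passed to tf.io.serialize_tensor above
    x = tf.io.parse_tensor(parsed["X"], out_type=tf.float64)
    return x, parsed["y"]

dataset = tf.data.TFRecordDataset("path/to/S001.tfrecord",  # hypothetical path
                                  compression_type="GZIP").map(_parse_eeg_example)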
def generate(dataset_root_dir, events_enum, labels_enum, labels, classes,
             window_size):
    dataset_dir = os.path.join(dataset_root_dir, "normalized-by-sample",
                               f"window-{window_size}",
                               "-".join(classes).lower())

    fsh.recreate_dir(dataset_dir)
    info = {"n_samples_by_file": 0}

    subjects_labels_counts = LabelsCounts()
    subjects_session_labels_counts = LabelsCounts()
    gdf_files_dir = os.path.join(dataset_root_dir, "gdf-files")
    gdf_file_names = filter(lambda f: re.match(r".*\.gdf$", f),
                            sorted(os.listdir(gdf_files_dir)))
    for gdf_file_name in gdf_file_names:
        gdf_file = mne.io.read_raw_gdf(os.path.join(gdf_files_dir,
                                                    gdf_file_name),
                                       preload=True)
        groups_gdf_file = re.match("(.)(\\d{2})(\\d{0,2})([ET])\\.gdf",
                                   gdf_file_name).groups()
        database_prefix = groups_gdf_file[0]
        subject = groups_gdf_file[1]
        session = groups_gdf_file[2]
        session_type = groups_gdf_file[3]

        labels_filepath = os.path.join(
            gdf_files_dir, "labels",
            f"{database_prefix}{subject}{session}{session_type}.mat")
        labels_file = io.loadmat(labels_filepath)["classlabel"]

        annotations = gdf_file.annotations
        start_trials_indexes = [
            event_index for event_index in range(len(annotations.description))
            if events_enum.get(annotations.description[event_index]) ==
            "NEW_TRIAL"
        ]

        indexes_channels_eeg = [
            index for index, ch in enumerate(gdf_file.ch_names)
            if "EEG" in ch
        ]
        n_channels = len(indexes_channels_eeg)
        frequency = int(gdf_file.info["sfreq"])
        # should consider the complete motor imagery period (4s),
        # not only the cue exhibition period (1.25s)
        duration_event = window_size // frequency
        n_samples = frequency * duration_event

        rejected_trials = 0
        ignored_trials = 0
        X = np.empty((0, n_channels))
        y = np.empty(0, dtype=np.int64)
        for n_trial, event_start_trial_index in enumerate(
                start_trials_indexes):
            cue_event_index = event_start_trial_index + 1
            onset_event = annotations.onset[cue_event_index]
            onset_index = int(np.ceil(onset_event * frequency))
            end_index = int(np.ceil(
                (onset_event + duration_event) * frequency))
            event_samples = end_index - onset_index
            if event_samples != n_samples:
                end_index += n_samples - event_samples

            # The event corresponding to the trial is the one following the
            # start-trial event
            cue_event = annotations.description[cue_event_index]
            if events_enum[cue_event] == "REJECTED_TRIAL":
                rejected_trials += 1
                continue

            label = labels_enum[labels_file[n_trial][0]]
            if label not in classes:
                ignored_trials += 1
                continue

            # The index 0 returns the data array of the gdf_file
            # The index 1 returns the times array of the gdf_file
            x = gdf_file[indexes_channels_eeg, onset_index:end_index][0].T
            x = (x - np.mean(x)) / np.std(x)
            X = np.vstack((X, x))

            y = np.append(y, labels[label])

        gdf_file.close()
        X = X.reshape((-1, n_samples, n_channels))

        tfrecord_filepath = os.path.join(
            dataset_dir,
            f"{database_prefix}{subject}{session}{session_type}.tfrecord")
        options = tf.io.TFRecordOptions(compression_type="GZIP")
        with tf.io.TFRecordWriter(tfrecord_filepath, options) as writer:
            for n_segment in range(len(y)):
                eeg_example = Example(features=Features(
                    feature={
                        "X": Feature(bytes_list=BytesList(value=[
                            tf.io.serialize_tensor(X[n_segment]).numpy()
                        ])),
                        "y": Feature(int64_list=Int64List(value=[y[n_segment]]))
                    }))
                writer.write(eeg_example.SerializeToString())

        valid_trials = len(y)
        if valid_trials > info["n_samples_by_file"]:
            info["n_samples_by_file"] = valid_trials

        labels_counts = np.unique(y, return_counts=True)[1]
        subjects_labels_counts.put(subject, labels_counts)
        subjects_session_labels_counts.put(subject + session_type,
                                           labels_counts)

        print("Info from file " + gdf_file_name)
        print(f"Labels counts: {labels_counts}")
        print("Valid Trials: " + str(valid_trials))
        print("Ignored Trials: " + str(ignored_trials))
        print("Rejected Trials: " + str(rejected_trials))
        print("Total Trials: " +
              str(valid_trials + ignored_trials + rejected_trials))

    print("Subjects Labels Counts:")
    print(subjects_labels_counts)
    print("Subjects Session Counts:")
    print(subjects_session_labels_counts)

    print("Generation dataset ended, saving info data ...")
    print("info[n_samples_by_file]=", info["n_samples_by_file"])
    info_filepath = os.path.join(dataset_dir, "info.pkl")
    with open(info_filepath, "wb") as fp:
        pickle.dump(info, fp, protocol=4)
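
This second generator uses the same TFRecord layout, writing one file per recording session plus an info.pkl holding the largest per-file sample count. A minimal sketch of loading them back, assuming dataset_dir points at the directory the generator wrote to (the path below is a placeholder):

import glob
import os
import pickle

import tensorflow as tf

dataset_dir = "path/to/dataset"  # hypothetical; use the dataset_dir computed by generate

with open(os.path.join(dataset_dir, "info.pkl"), "rb") as fp:
    info = pickle.load(fp)
print("n_samples_by_file:", info["n_samples_by_file"])

files = sorted(glob.glob(os.path.join(dataset_dir, "*.tfrecord")))
dataset = tf.data.TFRecordDataset(files, compression_type="GZIP")
# Records can then be parsed the same way as in the sketch above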