Example #1
    def load_as_tframe_data(cls, data_dir):
        from .dataset import DataSet
        file_path = os.path.join(data_dir, cls.TFD_FILE_NAME)
        if os.path.exists(file_path): return DataSet.load(file_path)

        # If .tfd file does not exist, try to convert from raw data
        console.show_status('Trying to convert raw data to tframe DataSet ...')
        images, labels = cls.load_as_numpy_arrays(data_dir)
        data_set = DataSet(images,
                           labels,
                           name=cls.DATA_NAME,
                           **cls.PROPERTIES)

        # Generate groups if necessary
        if data_set.num_classes is not None:
            groups = []
            dense_labels = misc.convert_to_dense_labels(labels)
            for i in range(data_set.num_classes):
                # Find samples of class i and append to groups
                samples = list(
                    np.argwhere([j == i for j in dense_labels]).ravel())
                groups.append(samples)
            data_set.properties[data_set.GROUPS] = groups

        # Show status
        console.show_status('Successfully converted {} samples'.format(
            data_set.size))
        # Save DataSet
        console.show_status('Saving data set ...')
        data_set.save(file_path)
        console.show_status('Data set saved to {}'.format(file_path))
        return data_set
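A minimal usage sketch of the loader above; `MNIST` is a hypothetical stand-in for any agent subclass that defines TFD_FILE_NAME, DATA_NAME, PROPERTIES and load_as_numpy_arrays:

# Hypothetical usage sketch (MNIST is a placeholder agent class).
data_set = MNIST.load_as_tframe_data('./data/mnist')
print(data_set.size, data_set.num_classes)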
Example #2
 def gen_batches(self, batch_size, **kwargs):
     assert self.is_ready
     checker.check_positive_integer(self.batches_per_epoch)
     for i in range(self.batches_per_epoch):
         matrix, labels = self._random_signal_matrix(
             batch_size, self.input_size)
         batch = DataSet(matrix, labels)
         batch.name = 'gpat_{}of{}'.format(i + 1, self.batches_per_epoch)
         yield batch
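The generator above can be consumed directly in a training-style loop; a minimal sketch, assuming `data_set` is an instance of the class that defines gen_batches and that batches_per_epoch has been set:

# Sketch only: iterate one epoch of randomly generated signal batches.
for batch in data_set.gen_batches(batch_size=32):
    assert isinstance(batch, DataSet)
    print(batch.name)  # e.g. 'gpat_1of<batches_per_epoch>'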
Example #3
 def __init__(self, signals, responses=None, data_dict=None,
              name='signal_set1', **kwargs):
   # Check signals
   signal_dict, fs = self._check_signals(signals, responses)
   data_dict = {} if data_dict is None else data_dict
   data_dict.update(signal_dict)
   kwargs.update({pedia.sampling_frequency: fs})
   # Call parent's constructor
   DataSet.__init__(self, data_dict=data_dict, name=name, **kwargs)
Example #4
def load_data(path, csv_path, fold=0):
    # TODO:
    train = pd.read_csv(csv_path)
    LABELS = list(train.label.unique())
    label_idx = {label: i for i, label in enumerate(LABELS)}
    train.set_index("fname", inplace=True)
    train["label_idx"] = train.label.apply(lambda x: label_idx[x])
    # split the train_set and the val_set
    skf = StratifiedKFold(train.label_idx, n_folds=10)

    for i, (train_split, val_split) in enumerate(skf):
        if i == fold:
            train_split_0 = train_split
            val_split_0 = val_split
            break
    audio_length = 32000
    data_set = DataSet.load(path)
    assert isinstance(data_set, DataSet)

    train_split_data = Gpat_set.split_data_set(train_split_0, data_set)
    val_set = Gpat_set.split_data_set(val_split_0, data_set)
    raw_val_set = val_set
    raw_val_set.properties[raw_val_set.NUM_CLASSES] = 41
    train_set = Gpat_set(features=train_split_data.features,
                         targets=train_split_data.targets,
                         NUM_CLASSES=41)

    train_set.init_groups()
    for i in range(len(val_set.features)):
        if i == 0:
            features = GPAT.length_adapted(val_set.features[i], audio_length)
            mfccs = librosa.feature.mfcc(features, 16000, n_mfcc=50)
            mfccs = np.expand_dims(mfccs, axis=0)
            features = np.reshape(features, (1, -1))
            # targets = batch_data[i].targets
        else:
            feature = GPAT.length_adapted(val_set.features[i], audio_length)
            mfcc = librosa.feature.mfcc(feature, 16000, n_mfcc=50)
            mfcc = np.expand_dims(mfcc, axis=0)
            mfccs = np.concatenate((mfccs, mfcc), axis=0)
            feature = np.reshape(feature, (1, -1))
            features = np.concatenate((features, feature), axis=0)
    targets = val_set.targets
    features = np.expand_dims(features, axis=2)
    mfccs = np.expand_dims(mfccs, axis=-1)
    val_set = DataSet(features, targets, data_dict={'mfcc': mfccs})
    test_set = val_set
    return train_set, val_set, test_set, raw_val_set
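A hedged call sketch for the loader above; the .tfd path is a placeholder, while the csv path mirrors the one used in example #5:

# Sketch only: load fold 0 of the audio data.
train_set, val_set, test_set, raw_val_set = load_data(
    '../data/processed_data/all_data.tfd',  # placeholder DataSet path
    '../data/original_data/train.csv',
    fold=0)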
Example #5
def load_demo_data(path):
    train = pd.read_csv("../data/original_data/train.csv")
    LABELS = list(train.label.unique())
    label_idx = {label: i for i, label in enumerate(LABELS)}
    train.set_index("fname", inplace=True)
    # test.set_index("fname", inplace=True)
    train["label_idx"] = train.label.apply(lambda x: label_idx[x])
    train_verified = train[train.manually_verified == 1]

    train_csv = train
    # test = pd.read_csv("../data/original_data/sample_submission.csv")
    config = Config(sampling_rate_raw=16000,
                    audio_duration=2,
                    n_folds=10,
                    learning_rate=0.001,
                    use_mfcc=True,
                    n_mfcc=50,
                    sampling_rate=16000)
    X_train, X_train_t = prepare_data(train_csv, config, path)
    # X_train_n = prepare_data(train, config, path, noise=True)
    y_train = to_categorical(train_csv.label_idx, num_classes=config.n_classes)
    X_train, train_mean = preprocess(X_train)
    pickle_data(train_mean, '../data/original_data/train_mean.pkl')
    # TODO:

    # split the train_set and the val_set
    skf = StratifiedKFold(train_csv.label_idx, n_folds=10)

    for i, (train_split, val_split) in enumerate(skf):
        if i == 1:
            train_split_0 = train_split
            val_split_0 = val_split
            break

    X_train_t = np.expand_dims(X_train_t, axis=-1)

    features = X_train_t[train_split_0]
    targets = y_train[train_split_0]
    train_set = DataSet(features=features,
                        targets=targets,
                        data_dict={'mfcc': X_train[train_split_0]})
    features = X_train_t[val_split_0]
    targets = y_train[val_split_0]
    val_set = DataSet(features=features,
                      targets=targets,
                      data_dict={'mfcc': X_train[val_split_0]})

    return train_set, val_set
Example #6
  def emit(self, num_steps):
    assert self.is_ready
    assert isinstance(num_steps, int)
    if num_steps < 0: num_steps = self.max_emit_length

    # Determine steps
    steps = min(self.max_emit_length, num_steps)

    template = self._data_sets[0]
    assert isinstance(template, DataSet)
    data_dict = template.data_dict.copy()
    cursors = None
    for key, data in data_dict.items():
      assert isinstance(data, np.ndarray)
      sample_shape = data.shape[1:]
      data_dict[key] = np.zeros(shape=(self.size, steps, *sample_shape))

      cursors = self._cursors.copy()
      for i in range(self.size):
        array = self._data_sets[i][key]
        assert isinstance(array, np.ndarray)
        c = cursors[i]
        data_dict[key][i] = array[c:c + steps]
        # Move cursor
        assert 0 < c + steps <= len(array)
        cursors[i] += steps

    # Update cursors
    assert cursors is not None
    self._cursors = cursors
    # Wrap data into a DataSet and return
    return DataSet(data_dict=data_dict, is_rnn_input=True)
Example #7
def load_test_data(path, train_mean=None):
    test = pd.read_csv("../data/original_data/sample_submission.csv")
    test.set_index("fname", inplace=True)

    train_csv = test
    # test = pd.read_csv("../data/original_data/sample_submission.csv")
    config = Config(sampling_rate_raw=16000,
                    audio_duration=2,
                    n_folds=10,
                    learning_rate=0.001,
                    use_mfcc=True,
                    n_mfcc=50,
                    sampling_rate=16000)

    train_csv = train_csv.head()

    X_train, X_train_t = prepare_data(train_csv, config, path)
    # X_train_n = prepare_data(train, config, path, noise=True)
    X_train, mean = preprocess(X_train, train_mean=train_mean)
    # TODO:

    X_train_t = np.expand_dims(X_train_t, axis=-1)

    test_set = DataSet(features=X_train_t, data_dict={'mfcc': X_train})

    return test_set
Example #8
File: bigdata.py Project: winkywow/tframe
 def _load_data_set(file_name):
     assert isinstance(file_name, str)
     extension = file_name.split('.')[-1]
     if extension == DataSet.EXTENSION:
         return DataSet.load(file_name)
     elif extension == SignalSet.EXTENSION:
         return SignalSet.load(file_name)
     else:
         raise TypeError(
             '!! Can not load file with extension .{}'.format(extension))
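A usage sketch of the extension-based dispatch above; the file names are placeholders, '.tfd' matches DataSet.EXTENSION, and the SignalSet extension is assumed:

images = _load_data_set('mnist.tfd')     # routed to DataSet.load
signals = _load_data_set('speech.sset')  # routed to SignalSet.load (assumed extension)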
Example #9
    def tfr_view(self):
        """View image sequence using tframe.ImageViewer"""
        from tframe.data.images.image_viewer import ImageViewer
        from tframe.data.dataset import DataSet

        images = self.sequence
        images = images / np.max(images)
        ds = DataSet(features=images)
        viewer = ImageViewer(ds)
        viewer.show()
Example #10
    def split_data_set(split_indices, data_set):
        # TODO: only for features are list and targets are ndarrays
        assert isinstance(data_set, DataSet)
        split_features = []
        for id in split_indices:
            split_features.append(data_set.features[id])

        split_targets = data_set.targets[split_indices]
        split_data_set = DataSet(split_features, split_targets)
        return split_data_set
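For illustration, a minimal sketch pairing this split with the fold indices computed in example #4:

# Sketch: carve the validation fold out of a loaded DataSet.
val_subset = Gpat_set.split_data_set(val_split_0, data_set)
assert len(val_subset.features) == len(val_split_0)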
Example #11
  def load_as_tframe_data(cls, data_dir, file_name=None, size=512,
                          unique_=True):
    # Check file_name
    if file_name is None: file_name = cls._get_file_name(size, unique_)
    data_path = os.path.join(data_dir, file_name)
    if os.path.exists(data_path): return DataSet.load(data_path)
    # If data does not exist, create a new one
    console.show_status('Making data ...')
    erg_list = ReberGrammar.make_strings(
      size, unique_, embedded=True, verbose=True)

    # Wrap erg into a DataSet
    features = [erg.one_hot for erg in erg_list]
    targets = [erg.transfer_prob for erg in erg_list]
    data_set = DataSet(features, targets, {'erg_list': erg_list},
                       name='Embedded Reber Grammar')
    console.show_status('Saving data set ...')
    data_set.save(data_path)
    console.show_status('Data set saved to {}'.format(data_path))
    return data_set
Example #12
    def _gen_rnn_batches(self, x, y, num_steps, *args):
        # Sanity check
        assert isinstance(x, np.ndarray) and isinstance(y, np.ndarray)
        assert isinstance(num_steps, int)
        assert len(x.shape) == 3 and x.shape[2] == self.input_size
        steps = x.shape[1]
        assert y.shape == (x.shape[0], steps, self.NUM_CLASSES)

        # Yield RNN batches
        if num_steps < 0: num_steps = steps
        yield_times = int(np.ceil(steps / num_steps))
        for i in range(yield_times):
            batch_x = x[:, i * num_steps:min((i + 1) * num_steps, steps)]
            batch_y = y[:, i * num_steps:min((i + 1) * num_steps, steps)]
            batch = DataSet(batch_x, batch_y, in_rnn_format=True)

            # State should be reset at the beginning of a sequence
            if i == 0: batch.should_reset_state = True
            batch.name = 'gpat_{}of{}'.format(i + 1, yield_times)
            yield batch
Example #13
 def stack(self):
   """Concatenate this sequence set (a list consists of sequences with shape
      [steps, *dim]) to a regular array with shape [sum(steps), *dim]"""
   if self.DATA_STACK in self.properties.keys():
     stack = self.properties[self.DATA_STACK]
     assert isinstance(stack, DataSet)
     return stack
   self.properties[self.DATA_STACK] = DataSet(
     data_dict=self._apply(np.concatenate, self.merged_data_dict),
     name=self.name + '(stacked)', **self.properties)
   return self.stack
Example #14
    def _gen_rnn_batches_by_wheel(self, batch_size, num_steps, round_len, L,
                                  **_):
        """Each sequence in batch is a sub-sequence of length L of a randomly
       selected sequence. First introduced in sampling LOB data.
       The sub-sequence length L must be specified.
    """
        # Sanity check
        if batch_size is None or batch_size < 0:
            batch_size = self.size
        if num_steps is None or num_steps < 0:
            num_steps = L
        # Generate feature list and target list
        features, targets = [], []
        wheel = Wheel(self.structure if th.use_wheel
                      else list(np.ones([self.size]) / self.size))
        for _ in range(batch_size):
            # Choose a sequence to sample from
            index = wheel.spin()
            t = np.random.randint(0, self.structure[index] - L + 1)
            x = self.features[index][t:t + L]
            y = self.targets[index][t:t + L]
            assert len(x) == len(y) == L
            features.append(x)
            targets.append(y)
        # Stack features and targets
        features, targets = np.stack(features), np.stack(targets)
        data_set = DataSet(features, targets, is_rnn_input=True)
        assert data_set.size == batch_size
        # Generate RNN batches using DataSet.gen_rnn_batches
        counter = 0
        for batch in data_set.gen_rnn_batches(batch_size,
                                              num_steps,
                                              is_training=True):
            yield batch
            counter += 1

        # Check round_len
        if counter != round_len:
            raise AssertionError(
                "!! counter = {} while round_len = {}. (batch_size = {}, num_steps={})"
                "".format(counter, round_len, batch_size, num_steps))
Example #15
File: fi2010.py Project: winkywow/tframe
    def rnn_batch_generator(data_set, batch_size, num_steps, is_training,
                            round_len):
        """Generated epoch batches are guaranteed to cover all sequences"""
        assert isinstance(data_set, SequenceSet) and is_training
        L = int(sum(data_set.structure) / batch_size)
        assert L < min(data_set.structure) and L == th.sub_seq_len
        rad = int(th.random_shift_pct * L)
        # Distribute batch_size to stocks
        # [23336, 44874, 38549, 54675, 93316]
        num_sequences = wise_man.apportion(data_set.structure, batch_size)
        # Generate feature list and target list
        features, targets = [], []
        for num, x, y in zip(num_sequences, data_set.features,
                             data_set.targets):
            # Find starts for each sequence to sample
            starts = wise_man.spread(len(x), num, L, rad)
            # Sanity check
            assert len(starts) == num
            # Put the sub-sequences into corresponding lists
            for s in starts:
                features.append(x[s:s + L])
                targets.append(y[s:s + L])
        # Stack features and targets
        features, targets = np.stack(features), np.stack(targets)
        data_set = DataSet(features, targets, is_rnn_input=True)
        assert data_set.size == batch_size
        # Generate RNN batches using DataSet.gen_rnn_batches
        counter = 0
        for batch in data_set.gen_rnn_batches(batch_size,
                                              num_steps,
                                              is_training=True):
            yield batch
            counter += 1

        # Check round_len
        if counter != round_len:
            raise AssertionError(
                '!! counter = {} while round_len = {}. (batch_size = {}, num_steps={})'
                ''.format(counter, round_len, batch_size, num_steps))
Example #16
  def load_data_set(self, _):
    filename = filedialog.askopenfilename(
      initialdir=self.last_dir, title='Load data set',
      filetypes=(("TFData files", '*.tfd'),))
    if filename == '':
      return

    self.filename = filename
    self.set_data(DataSet.load(filename))
    self._update_title()

    # Print status
    print(">> Loaded data set '{}'".format(filename))
Example #17
 def padded_stack(self):
   """Stack this sequence set with 0 padded. The output shape is
      (self.size, max_steps, *dim)"""
   if self.PADDED_STACK in self.properties.keys():
     stack = self.properties[self.PADDED_STACK]
     assert isinstance(stack, DataSet)
     return stack
   max_step = max(self.structure)
   f = lambda seqs: self._pad_sequences(seqs, max_step)
   self.properties[self.PADDED_STACK] = DataSet(
     data_dict=self._apply(f, self.merged_data_dict),
     name=self.name + '(padded_stack)', is_rnn_input=True, **self.properties)
   self.padded_stack.active_length = self.structure
   return self.padded_stack
Example #18
File: core.py Project: Wuyou98/Image_unet
def activate(export_false=False):
    assert callable(th.model)
    model = th.model(th)
    assert isinstance(model, Predictor)

    train_set, val_set, test_set = load_data(th.data_dir, 600, -1, 1)

    if th.train:
        model.train(train_set,
                    validation_set=val_set,
                    trainer_hub=th,
                    probe=lambda t: probe(t, train_set))
    else:
        from tframe.data.images.image_viewer import ImageViewer
        from tframe.data.dataset import DataSet
        import cv2
        import skimage.transform as transform

        dir = os.path.join(th.data_dir, 'test')
        imgs = []

        images = os.listdir(dir)
        for i in range(len(images)):
            img = cv2.imread(os.path.join(dir, images[i]), 0)
            assert isinstance(img, np.ndarray)
            img = img / 255
            img = transform.resize(img, [256, 256])
            imgs.append(img.reshape(1, 256, 256, 1))

        X = np.concatenate(imgs)

        test_set_a = DataSet(features=X)

        images = model.predict(test_set_a, batch_size=2)
        images = images.reshape([-1, 256, 256])
        viewer = ImageViewer(DataSet(features=images))
        viewer.show()
Example #19
    def load_as_tframe_data(cls, data_dir, **kwargs):

        # Load directly if all files exists
        data_path = cls._get_data_paths(data_dir)
        if os.path.exists(data_path):
            data_set = DataSet.load(data_path)
        else:
            # If data does not exist, create from raw data
            console.show_status('Creating data sets ...')
            data, mapping = cls._load_raw_data(data_dir)
            x = np.array(data[:-1]).reshape(-1, 1)
            y = np.array(data[1:]).reshape(-1, 1)
            data_set = DataSet(x, y, name='Text8.char', mapping=mapping)
            # Save data set and show info
            data_set.save(data_path)
            console.show_status('{} saved to `{}`'.format(
                data_set.name, data_path))

        # Show mapping size
        console.show_status(
            'Data sets (containing {} different characters) loaded:'.format(
                len(data_set['mapping'])))

        return data_set
Example #20
    def set_data(self, data_set):
        if data_set is not None:
            # If a path is given
            if isinstance(data_set, six.string_types):
                data_set = DataSet.load(data_set)
            if not isinstance(data_set, DataSet):
                raise TypeError(
                    '!! Data set must be an instance of tframe DataSet')
            if not data_set.is_regular_array:
                data_set = data_set.stack
            self.data_set = data_set
            self._set_cursor(0)
            if self.data_set.targets is not None:
                self.labels = misc.convert_to_dense_labels(
                    self.data_set.targets)
            console.show_status('Data set set to ImageViewer')

            # Refresh image viewer
            self.refresh()
Example #21
  def get_round_length(batch_size, num_steps, lengths, len_f=None):
    checker.check_type(lengths, int)
    pe = ParallelEngine(batch_size)
    round_len, cursor = 0, 0

    while True:
      # Set sequences and targets if necessary
      while not pe.is_ready:
        if cursor < len(lengths):
          length = lengths[cursor] if len_f is None else len_f(lengths[cursor])
          ds = DataSet(features=np.zeros(shape=(length, 1)))
          cursor += 1
        else: ds = None
        pe.set_data(ds)

      if pe.flameout: break
      pe.emit(num_steps)
      round_len += 1

    return round_len
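A hedged call sketch, assuming the method above is exposed as a static helper on ParallelEngine:

# Sketch: number of 20-step emissions needed to cover three sequences
# of the given lengths using two parallel slots.
round_len = ParallelEngine.get_round_length(2, 20, [100, 250, 80])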
Example #22
def image_augmentation_processor(data_batch: DataSet, is_training: bool):
    # Get hub
    th = tfr.hub
    if not is_training or th.aug_config is None: return data_batch
    # Parse augmentation setting
    assert isinstance(th.aug_config, str)
    configs = [Parser.parse(s) for s in th.aug_config.split('|')]
    if len(configs) == 0: return data_batch

    # Apply each method according to configs
    for cfg in configs:
        # Find method
        if cfg.name == 'rotate': method = _rotate
        elif cfg.name == 'flip': method = _flip
        else:
            raise KeyError('!! Unknown augmentation option {}'.format(
                cfg.name))
        # Do augmentation
        data_batch.features = method(data_batch.features, *cfg.arg_list,
                                     **cfg.arg_dict)

    return data_batch
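A minimal invocation sketch, assuming `data_batch` is an existing tframe DataSet of images and that tfr.hub.aug_config accepts a plain option string such as 'flip' (the exact syntax is defined by Parser.parse):

# Sketch only: augment one training batch in place.
tfr.hub.aug_config = 'flip'  # hypothetical config value
data_batch = image_augmentation_processor(data_batch, is_training=True)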
Example #23
    def set_data(self, data_set):
        if data_set is not None:
            # If a path is given
            if isinstance(data_set, six.string_types):
                data_set = DataSet.load(data_set)
            if not isinstance(data_set, DataSet):
                raise TypeError(
                    '!! Data set must be an instance of tframe DataSet')
            if not data_set.is_regular_array:
                data_set = data_set.stack
            self.data_set = data_set
            self._set_cursor(0)
            # For DataSet like MNIST and CIFAR-XXX
            if self.data_set.targets is not None:
                if len(self.data_set.targets.shape) == 1:
                    self.labels = self.data_set.targets
                elif len(self.data_set.targets.shape) == 2:
                    self.labels = misc.convert_to_dense_labels(
                        self.data_set.targets).flatten()
            # Consider DataSets in image segmentation tasks
            interleave_key = self.kwargs.get('interleave_key', None)
            if interleave_key is not None:
                if not interleave_key in data_set.data_dict.keys():
                    raise KeyError('!! Can not find `{}` in DataSet'.format(
                        interleave_key))
                else:
                    shadows = getattr(data_set, interleave_key)
                    features = data_set.features
                    assert shadows.shape == features.shape
                    images = []
                    for x, y in zip(features, shadows):
                        images.append(np.reshape(x, (1, ) + x.shape))
                        images.append(np.reshape(y, (1, ) + y.shape))
                    data_set.features = np.concatenate(images, axis=0)

            console.show_status('Data set set to ImageViewer')

            # Refresh image viewer
            self.refresh()
Example #24
    def gen_batches(self, batch_size, shuffle=False):
        round_len = self.get_round_length(batch_size)
        for i in range(round_len):
            if shuffle:
                indices = self._rand_indices(size=batch_size)
            else:
                indices = list(range(i * batch_size,
                                     min((i + 1) * batch_size, len(self.targets))))

            batch_features = []
            for indice in indices:
                batch_features.append(self.features[indice])
            for i in range(len(indices)):
                if i == 0:
                    features = GPAT.length_adapted(batch_features[i],
                                                   self.audio_length)
                    mfccs = librosa.feature.mfcc(features, 16000, n_mfcc=50)
                    mfccs = np.expand_dims(mfccs, axis=0)
                    features = GPAT.audio_norm(features)
                    features = np.reshape(features, (1, -1))
                    # targets = batch_data[i].targets
                else:
                    feature = GPAT.length_adapted(batch_features[i],
                                                  self.audio_length)
                    mfcc = librosa.feature.mfcc(feature, 16000, n_mfcc=50)
                    mfcc = np.expand_dims(mfcc, axis=0)
                    mfccs = np.concatenate((mfccs, mfcc), axis=0)
                    feature = GPAT.audio_norm(feature)
                    feature = np.reshape(feature, (1, -1))
                    features = np.concatenate((features, feature), axis=0)
                    # targets = np.concatenate((targets, batch_data[i].targets), axis=0)
            targets = self.targets[indices]
            features = np.expand_dims(features, axis=2)
            mfccs = np.expand_dims(mfccs, axis=-1)
            output_batch_data = DataSet(features,
                                        targets,
                                        data_dict={'mfcc': mfccs})
            yield output_batch_data
Example #25
data_generator = DataGenerator(config=config,
                               data_dir='../data/original_data/audio_train/',
                               list_IDs=train.index,
                               labels=train["label_idx"])
batches = len(train.index) // 64
for i in range(batches):
    feature, target = data_generator[i]
    if i == 0:
        features = feature
        targets = target
    else:
        features = np.concatenate((features, feature), axis=0)
        targets = np.concatenate((targets, target), axis=0)

demo_data = DataSet(features=features, targets=targets)
demo_data.save('../data/processed_data/demo_data_0')
a = data_generator[2]
b = a[0]
c = 1
for i in range(len(val_set.features)):
    if i == 0:
        features = GPAT.length_adapted(val_set.features[i], audio_length)
        features = np.reshape(features, (1, -1))
        # targets = batch_data[i].targets
    else:
        feature = GPAT.length_adapted(val_set.features[i], audio_length)
        feature = np.reshape(feature, (1, -1))
        features = np.concatenate((features, feature), axis=0)
# targets = np.concatenate((targets, batch_data[i].targets), axis=0)
targets = val_set.targets
Example #26
 def f(u):
     assert isinstance(model, Predictor)
     return np.ravel(model.predict(DataSet(features=u)))
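As a quick illustration, `f` behaves like a plain NumPy function once `model` is in scope; a sketch with a made-up input shape:

# Sketch: a single sample with a hypothetical feature dimension of 10.
u = np.zeros((1, 10))
y = f(u)  # flattened model output as a 1-D ndarray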