Example #1
 @classmethod
 def load_as_tframe_data(cls,
                         data_dir,
                         file_name=None,
                         permute=False,
                         permute_mark='alpha',
                         **kwargs):
     # Check file name
     if file_name is None:
         file_name = cls._get_file_name(permute, permute_mark) + '.tfds'
     data_path = os.path.join(data_dir, file_name)
     if os.path.exists(data_path): return SequenceSet.load(data_path)
     # If data does not exist, create a new data set
     console.show_status('Creating data ...')
     images, labels = MNIST.load_as_numpy_arrays(data_dir)
     # images (70000, 784, 1), np.float64
     images = images.reshape(images.shape[0], -1, 1) / 255.
     # Permute pixels if necessary: the same random reordering is applied
     # to the time-step (pixel) axis of every image
     if permute:
         images = np.swapaxes(images, 0, 1)
         images = np.random.permutation(images)
         images = np.swapaxes(images, 0, 1)
     # labels (70000, 10), np.float64
     labels = convert_to_one_hot(labels, 10)
     # Wrap data into a Sequence Set
     features = [image for image in images]
     targets = [label for label in labels]
     data_set = SequenceSet(features,
                            summ_dict={'targets': targets},
                            n_to_one=True,
                            name='pMNIST')
     console.show_status('Saving data set ...')
     data_set.save(data_path)
     console.show_status('Data set saved to `{}`'.format(data_path))
     return data_set
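A minimal call-site sketch for this loader; `PermutedMNIST` is a hypothetical
name for the agent class that owns the classmethod, and the path is
illustrative:

# Hypothetical call site (class name and path assumed, not from the snippet)
data_set = PermutedMNIST.load_as_tframe_data(
    'data/mnist', permute=True, permute_mark='alpha')
assert len(data_set.features) == 70000  # 70000 pixel sequences of length 784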
Example #2
  @classmethod
  def load_as_tframe_data(cls, data_dir, train_size=1000, test_size=200,
                          file_name=None, unique_=True, cheat=True,
                          local_binary=True, multiple=1, rule=None):
    assert rule in ('lstm97', 'pau19', None)

    # Check file_name
    if file_name is None:
      file_name = cls._get_file_name(
        train_size, test_size, unique_, cheat, local_binary, multiple, rule)
    data_path = os.path.join(data_dir, file_name)
    if os.path.exists(data_path): return SequenceSet.load(data_path)
    # If data does not exist, create a new one
    console.show_status('Making data ...')

    if rule == 'pau19':
      erg_list = ReberGrammar.make_strings(
        train_size + test_size, True, embedded=True, multiple=multiple,
        verbose=True)
    elif rule == 'lstm97':
      train_list = ReberGrammar.make_strings(
        train_size, False, embedded=True, verbose=True, multiple=multiple)
      test_list = ReberGrammar.make_strings(
        test_size, False, embedded=True, exclusive=train_list, verbose=True,
        multiple=multiple)
      erg_list = train_list + test_list
    else:
      erg_list = ReberGrammar.make_strings(
        train_size + test_size, unique_, embedded=True, verbose=True,
        multiple=multiple)

    # Wrap erg into a DataSet
    features = [erg.one_hot for erg in erg_list]
    val_targets = [erg.local_binary if local_binary else erg.transfer_prob
                   for erg in erg_list]
    targets = ([erg.observed_prob for erg in erg_list]
               if not cheat else val_targets)
    # targets = [erg.transfer_prob for erg in erg_list]
    data_set = SequenceSet(
      features, targets, data_dict={'val_targets': val_targets},
      erg_list=tuple(erg_list), name='Embedded Reber Grammar')
    console.show_status('Saving data set ...')
    data_set.save(data_path)
    console.show_status('Data set saved to {}'.format(data_path))
    return data_set
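Note the role of `rule`: 'lstm97' draws the train and test strings separately
and keeps them mutually exclusive, while 'pau19' and None sample a single pool
of train_size + test_size strings. A minimal call-site sketch, with `ERG` as a
hypothetical name for the owning class:

# Hypothetical call site
data_set = ERG.load_as_tframe_data(
    'data/erg', train_size=1000, test_size=200, rule='lstm97')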
Example #3
    @classmethod
    def load_as_tframe_data(cls,
                            data_dir,
                            auction=False,
                            norm_type="zscore",
                            setup=None,
                            file_slices=None,
                            **kwargs):
        # Confirm type of normalization
        nt_lower = norm_type.lower()
        # 'Zscore' for directory names and 'ZScore' for file names
        if nt_lower in ["1", "zscore"]:
            type_id, norm_type = 1, "Zscore"
        elif nt_lower in ["2", "minmax"]:
            type_id, norm_type = 2, "MinMax"
        elif nt_lower in ["3", "decpre"]:
            type_id, norm_type = 3, "DecPre"
        else:
            raise KeyError(
                "Unknown type of normalization `{}`".format(norm_type))
        # Load directly if dataset exists
        data_path = cls._get_data_path(data_dir, auction, norm_type, setup)
        if os.path.exists(data_path):
            return SequenceSet.load(data_path)
        # If dataset does not exist, create from raw data
        console.show_status("Creating `{}` from raw data ...".format(
            os.path.basename(data_path)))
        # Load raw data
        features, targets = cls._load_raw_data(data_dir,
                                               auction,
                                               norm_type,
                                               type_id,
                                               file_slices=file_slices)

        # Wrap raw data into tframe Sequence set
        data_dict = {"raw_data": features}
        data_dict.update(targets)
        seq_set = SequenceSet(data_dict=data_dict, name=cls.DATA_NAME)
        # Save Sequence set
        seq_set.save(data_path)
        console.show_status("Sequence set saved to `{}`".format(data_path))
        # Return
        return seq_set
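A call-site sketch; `FI2010` is a hypothetical name for the owning class.
`norm_type` accepts '1'/'zscore', '2'/'minmax' or '3'/'decpre',
case-insensitively:

# Hypothetical call site
seq_set = FI2010.load_as_tframe_data(
    'data/fi2010', auction=False, norm_type='zscore')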
Example #4
    @classmethod
    def load_as_tframe_data(cls, data_dir, num_words=10000, **kwargs):
        # Load directly if data set exists
        data_path = cls._get_data_path(data_dir, num_words)
        if os.path.exists(data_path): return SequenceSet.load(data_path)
        # If data does not exist, create from raw data
        console.show_status('Creating data sets ...')
        (train_data, train_labels), (test_data, test_labels) = \
            cls._load_raw_data(data_dir, num_words=num_words)
        data_list = list(train_data) + list(test_data)
        features = [np.array(cmt).reshape([-1, 1]) for cmt in data_list]

        targets = list(np.concatenate((train_labels, test_labels)))

        data_set = SequenceSet(features,
                               summ_dict={'targets': targets},
                               n_to_one=True,
                               name='IMDB')
        console.show_status('Saving data set ...')
        data_set.save(data_path)
        console.show_status('Data set saved to `{}`'.format(data_path))
        return data_set
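A call-site sketch; `IMDB` is used here as a hypothetical name for the owning
agent class:

# Hypothetical call site
data_set = IMDB.load_as_tframe_data('data/imdb', num_words=10000)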
Example #5
    @classmethod
    def load_as_tframe_data(cls,
                            data_dir,
                            file_name=None,
                            rgb=True,
                            permute=False,
                            permute_mark='alpha',
                            **kwargs):
        assert rgb and not permute
        # Check file name
        if file_name is None:
            file_name = cls._get_file_name(rgb, permute,
                                           permute_mark) + '.tfds'
        data_path = os.path.join(data_dir, file_name)
        if os.path.exists(data_path): return SequenceSet.load(data_path)

        # If data does not exist, create a new data set
        console.show_status('Creating data ...')
        images, labels = CIFAR10.load_as_numpy_arrays(data_dir)
        # images (60000, 32, 32, 3), np.float64
        images = images.reshape(60000, 32 * 32, 3 if rgb else 1) / 255.
        # permute images if necessary
        if permute: raise NotImplementedError

        # labels (60000, 10), np.int32
        labels = convert_to_one_hot(labels, 10)
        # Wrap data into a Sequence Set
        features = [image for image in images]
        targets = [label for label in labels]
        data_set = SequenceSet(features,
                               summ_dict={'targets': targets},
                               n_to_one=True,
                               name='sCIFAR10')
        console.show_status('Saving data set ...')
        data_set.save(data_path)
        console.show_status('Data set saved to `{}`'.format(data_path))
        return data_set
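A call-site sketch; `SCIFAR10` is a hypothetical class name. Only the
rgb=True, permute=False combination is supported, as the assertion at the top
of the method enforces:

# Hypothetical call site
data_set = SCIFAR10.load_as_tframe_data('data/cifar10', rgb=True)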
Example #6
    @classmethod
    def load_raw_LOBs(cls, data_dir, auction=False):
        # Load directly if dataset exists
        data_path = cls._get_data_path(data_dir, auction=auction)
        if os.path.exists(data_path): return SequenceSet.load(data_path)
        # Otherwise restore raw LOBs from decimal precision data
        dp_set = cls.load_as_tframe_data(data_dir,
                                         auction=auction,
                                         norm_type='decpre',
                                         setup=9,
                                         file_slices=(slice(8, 9), slice(8, 9)))
        # Extract the first 40 dimensions from dp_set.data_dict['raw_data']
        dp_lob_list = [array[:, :40] for array in dp_set.data_dict['raw_data']]
        # Set parameters for restoration
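        # Judging from the coefficients below, DecPre scaling divided prices
        # by 1e4 and volumes by 1e5 across the 20 interleaved (price, volume)
        # pairs, so multiplying restores the raw magnitudes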
        p_coef, v_coef = 10000, 100000
        coefs = np.array([p_coef, v_coef] * 20).reshape(1, 40)
        lob_list = [array * coefs for array in dp_lob_list]
        # Check targets
        cls._check_targets(data_dir, auction, dp_set.data_dict)
        # Check lob list
        cls._check_raw_lob(data_dir, auction, lob_list, raise_err=True)

        # Separate sequences for each stock
        # i  0 1 2 3 4 5 6 7
        # --------------------
        #    1 1 0 0 0 1 1 1        := x
        #      1 1 0 0 0 1 1 1
        # d  x 0 1 0 0 1 0 0 x      x[0:2], x[2:5], x[5:8]
        # --------------------
        # j    0 1 2 3 4 5 6
        #        *     *
        # |x[1:] - x[:-1]| reveals cliffs
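        # Worked example: x = [1, 1, 0, 0, 0, 1, 1, 1] gives
        # d = [0, 1, 0, 0, 1, 0, 0], non-zero at j = 1 and j = 4,
        # which splits x into x[0:2], x[2:5] and x[5:8]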
        LOBs = [[] for _ in range(5)]
        horizons = [10, 20, 30, 50, 100]
        targets = {h: [[] for _ in range(5)] for h in horizons}
        for j, lobs in enumerate(lob_list):
            # Find cliff indices
            max_delta = 300 if auction else 200
            indices = cls._get_cliff_indices(lobs,
                                             auction,
                                             max_delta=max_delta)
            # Fill LOBs
            from_i = 0
            for stock in range(5):
                to_i = (indices[stock] + 1) if stock < 4 else len(lobs)
                slc = slice(from_i, to_i)
                LOBs[stock].append(lobs[slc])
                for h in horizons:
                    targets[h][stock].append(dp_set.data_dict[h][j][slc])
                if stock != 4: from_i = indices[stock] + 1
        # Generate new data_dict
        data_dict = {
            h: [np.concatenate(tgt_list) for tgt_list in tgt_lists]
            for h, tgt_lists in targets.items()
        }
        data_dict['raw_data'] = [np.concatenate(lb_list) for lb_list in LOBs]
        # Initiate a new seq_set
        seq_set = SequenceSet(
            data_dict=data_dict,
            name='FI-2010-LOBs',
            **{cls.LEN_PER_DAY_PER_STOCK:
                   cls._get_len_per_day_per_stock(data_dir, auction)})
        # Sanity check (394337)
        assert sum(seq_set.structure) == sum(cls.DAY_LENGTH[auction])
        # Save and return
        seq_set.save(filename=data_path)
        console.show_status('{} saved to `{}`'.format(seq_set.name, data_path))
        return seq_set
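A call-site sketch; `FI2010` again stands in for the owning class
(hypothetical name):

# Hypothetical call site
lob_set = FI2010.load_raw_LOBs('data/fi2010', auction=False)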