Example #1
    def save_query_to_hdf5_point(self, save_path_queries_hdf5, entry_id,
                                 sample_original_space):
        """
        :return:
        """
        sample_original_space = to_vector(sample_original_space).T
        if os.path.isfile(save_path_queries_hdf5):
            with h5py.File(save_path_queries_hdf5, 'r+') as hf:
                points_dataset = hf.get('point_queries')
                already_in_points_ds = points_dataset.shape[0]
                points_dataset.resize(already_in_points_ds +
                                      sample_original_space.shape[0],
                                      axis=0)
                points_dataset[
                    already_in_points_ds:already_in_points_ds +
                    sample_original_space.shape[0], :] = sample_original_space

                entryids_dataset = hf.get('entry_ids')
                already_in_entryids_ds = entryids_dataset.len()
                entryids_dataset.resize(already_in_entryids_ds + 1, axis=0)
                entryids_dataset[
                    already_in_entryids_ds:already_in_entryids_ds +
                    1] = entry_id

                split_dict = {
                    "data": {
                        "point_queries": (0, already_in_points_ds +
                                          sample_original_space.shape[0]),
                        "entry_ids": (0, already_in_entryids_ds + 1)
                    }
                }
                hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
        else:
            # HDF5 query point save file does not exist yet; create it.
            f = h5py.File(save_path_queries_hdf5, "w")

            points_dataset = f.create_dataset(
                'point_queries',
                sample_original_space.shape,
                maxshape=(None, sample_original_space.shape[1]),
                dtype="float32")
            points_dataset[...] = sample_original_space
            entryids_dataset = f.create_dataset('entry_ids', (1, ),
                                                maxshape=(None, ),
                                                dtype=int)
            entryids_dataset[...] = entry_id

            split_dict = {
                "data": {
                    "point_queries": (0, sample_original_space.shape[0]),
                    "entry_ids": (0, 1)
                }
            }
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
            f.close()
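For reference, both datasets written above are registered under a single 'data' split, so the accumulated queries can be read back through the same Fuel API these examples revolve around. A minimal read-back sketch, assuming a hypothetical file path:

from fuel.datasets.hdf5 import H5PYDataset

# 'queries.hdf5' stands in for save_path_queries_hdf5.
queries = H5PYDataset('queries.hdf5',
                      which_sets=('data', ),
                      sources=('point_queries', 'entry_ids'),
                      load_in_memory=True)
points, entry_ids = queries.data_sources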
Example #2
 def get_dataset(self, part, max_length=None):
     if part not in self._dataset_cache:
         part_path = self.get_dataset_path(part)
         if self._layout == 'lambada' and part == 'train':
             self._dataset_cache[part] = H5PYDataset(part_path, ('train', ))
         elif self._layout == 'squad':
             self._dataset_cache[part] = SQuADDataset(part_path, ('all', ))
         elif self._layout == 'snli' or self._layout == 'mnli':
             self._dataset_cache[part] = H5PYDataset(h5py.File(part_path, "r"), \
                 ('all',), sources=('sentence1', 'sentence2', 'label',), load_in_memory=True)
         else:
             self._dataset_cache[part] = TextDataset(part_path, max_length)
     return self._dataset_cache[part]
Example #3
    def save_db_point_to_hdf5(self, db_point_scaled_space):
        """
        Save a decision boundary annotation to hdf5.
        :param db_point_scaled_space: (n_samples, n_features)
        :return:
        """
        try:
            db_point_original_space = self.scaling_transformation.inverse_transform(
                db_point_scaled_space)  # shape (1,nlat)
            if os.path.isfile(self.save_path_dbpoints_hdf5):
                with h5py.File(self.save_path_dbpoints_hdf5, 'r+') as hf:
                    dbpoints_dataset = hf.get('db_points')
                    already_in_dataset = dbpoints_dataset.shape[0]
                    dbpoints_dataset.resize(already_in_dataset +
                                            db_point_original_space.shape[0],
                                            axis=0)
                    dbpoints_dataset[already_in_dataset:already_in_dataset +
                                     db_point_original_space.
                                     shape[0], :] = db_point_original_space

                    split_dict = {
                        "data": {
                            "db_points": (0, already_in_dataset +
                                          db_point_original_space.shape[0])
                        }
                    }
                    hf.attrs["split"] = H5PYDataset.create_split_array(
                        split_dict)
            else:
                # HDF5 decision boundary save file does not exist yet; create it.
                f = h5py.File(self.save_path_dbpoints_hdf5, "w")
                dbpoints_dataset = f.create_dataset(
                    'db_points',
                    db_point_original_space.shape,
                    maxshape=(None, db_point_original_space.shape[1]),
                    dtype="float32")
                dbpoints_dataset[...] = db_point_original_space

                split_dict = {
                    "data": {
                        "db_points": (0, db_point_original_space.shape[0])
                    }
                }
                f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
                f.flush()
                f.close()
        except Exception:
            traceback.print_exc()
Example #4
File: ilsvrc2012.py Project: Scyfer/fuel
def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    n_labeled = n_train + n_valid
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_labeled, 1),
                             dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')
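A hypothetical call sketch for the helper above; create_splits is assumed to come from the same converter module, and the file name and example counts are illustrative:

import h5py

with h5py.File('ilsvrc2012.hdf5', 'w') as hdf5_file:
    # Illustrative counts: 1,281,167 train / 50,000 valid / 100,000 test images.
    prepare_hdf5_file(hdf5_file, n_train=1281167, n_valid=50000, n_test=100000)
    # The encoded_images / targets / filenames datasets are still empty here;
    # the rest of the conversion pipeline fills them in.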
Example #5
File: celeba.py Project: Afrik/fuel
def _initialize_conversion(directory, output_path, image_shape):
    h5file = h5py.File(output_path, mode='w')
    split_dict = {
        'train': {
            'features': (0, TRAIN_STOP),
            'targets': (0, TRAIN_STOP)},
        'valid': {
            'features': (TRAIN_STOP, VALID_STOP),
            'targets': (TRAIN_STOP, VALID_STOP)},
        'test': {
            'features': (VALID_STOP, NUM_EXAMPLES),
            'targets': (VALID_STOP, NUM_EXAMPLES)}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    targets_dataset = h5file.create_dataset(
        'targets', (NUM_EXAMPLES, 40), dtype='uint8')
    targets_dataset.dims[0].label = 'batch'
    targets_dataset.dims[1].label = 'target'
    targets_dataset[...] = (
        numpy.loadtxt(os.path.join(directory, ATTRIBUTES_FILE), dtype='int32',
                      skiprows=2, usecols=tuple(range(1, 41))) +
        1) / 2

    features_dataset = h5file.create_dataset(
        'features', (NUM_EXAMPLES, 3) + image_shape, dtype='uint8')
    features_dataset.dims[0].label = 'batch'
    features_dataset.dims[1].label = 'channel'
    features_dataset.dims[2].label = 'height'
    features_dataset.dims[3].label = 'width'

    return h5file
Example #6
def get_comb_stream(fea2obj,
                    which_set,
                    batch_size=None,
                    shuffle=True,
                    num_examples=None):
    streams = []
    for fea in fea2obj:
        obj = fea2obj[fea]
        dataset = H5PYDataset(obj.fuelfile,
                              which_sets=(which_set, ),
                              load_in_memory=True)
        if batch_size is None:
            batch_size = dataset.num_examples
        if num_examples is None:
            num_examples = dataset.num_examples
        if shuffle:
            iterschema = ShuffledScheme(examples=num_examples,
                                        batch_size=batch_size,
                                        rng=numpy.random.RandomState(seed))
        else:
            iterschema = SequentialScheme(examples=num_examples,
                                          batch_size=batch_size)
        stream = DataStream(dataset=dataset, iteration_scheme=iterschema)
        if fea in seq_features:
            stream = CutInput(stream, obj.max_len)
            if obj.rec == True:
                logger.info('transforming data for recursive input')
                stream = LettersTransposer(
                    stream, which_sources=fea
                )  # Required because Recurrent last_hid receive as input [sequence, batch,# features]
        streams.append(stream)
    stream = Merge(streams, tuple(fea2obj.keys()))
    return stream, num_examples
Example #7
def build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file, vectorfile, upto=-1):
    (embeddings, word2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in word2idx:
            entvec = embeddings[word2idx[mye]]
        input_entvec[i] = entvec
    print input_entvec.shape
    hdf5_file += '_entvec.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('entvec', input_entvec.shape, dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = vectorsize
    features[...] = input_entvec
    features.dims[0].label = 'entity_vector'
    nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
    split_dict = {
        'train': {'entvec': (0, nsamples_train)},
        'dev': {'entvec': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'entvec': (nsamples_train + nsamples_dev, totals)}}    
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building entityVec dataset finished. It saved in: %s', hdf5_file)
Example #8
 def load_data(hdf5_file, set, sources):
     data = H5PYDataset(hdf5_file,
                        which_sets=(set, ),
                        sources=sources,
                        load_in_memory=True)
     X, y = data.data_sources
     return X.astype(np.float32), y.astype(np.float32)
Example #9
def load_decision_boundary_from_hdf5(hdf5_file, index):
    data = H5PYDataset(hdf5_file,
                       which_sets=("data", ),
                       sources=("w", "b"),
                       load_in_memory=True)
    w, b = data.data_sources
    return {"w": w[index, :].reshape((w.shape[1], 1)), "b0": b[index]}
Example #10
    def get_dataset(self, which_sets, load_in_memory=False, **kwargs):
        """Return fuel dataset object specified by which_sets tuple and load it in memory

        Args:
            which_sets (:obj:`tuple` of :obj:`str`): Names of the splits to load.
                Valid values are determined by the ``info.pkl`` that was loaded.
                You can get the list of split set names by :meth:`get_set_list()`.
                Usually, if the dataset is split by weeks, the split name is in the form of ``week <num>``.
                If the dataset is split by days, the split name is in the form of ``day <num>``.
            load_in_memory (:obj:`bool`, Optional): Default to False.
                Whether to load the data in main memory.

        Returns:
            :class:`fuel.datasets.base.Dataset`: A Fuel dataset object created by
                :class:`fuel.datasets.hdf5.H5PYDataset`
        """
        # Check if sets exist as split name in metadata
        for set_name in which_sets:
            if set_name not in self.info['split_sets']:
                logger.error('set %s not found in splits' % set_name)
        # Load specified splits and return
        return H5PYDataset(file_or_path=self.data_filename,
                           which_sets=which_sets,
                           load_in_memory=load_in_memory,
                           **kwargs)
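A short, hypothetical usage sketch for the method above, assuming an instance called loader whose dataset happens to be split by weeks:

dataset = loader.get_dataset(which_sets=('week 1', 'week 2'),
                             load_in_memory=True)
print(dataset.num_examples)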
Example #11
def build_contexts_ds(config, all_contexts, nsamples_train, nsamples_dev,
                      nsamples_test, nsamples_dev_big):
    logger.info('building contexts dataset')
    totals = len(all_contexts)
    ctx_dtype = h5py.special_dtype(vlen=np.dtype('int32'))
    logger.info("#contexts: %d", totals)
    with h5py.File(config['dsdir'] + "_contexts.hdf", mode='w') as fp:
        contexts = fp.create_dataset(
            'contexts',
            compression='gzip',
            data=[np.asarray(ctx.context) for ctx in all_contexts],
            shape=(totals, ),
            dtype=ctx_dtype)
        split_dict = {
            'train': {
                'contexts': (0, nsamples_train)
            },
            'dev': {
                'contexts': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'contexts': (nsamples_train + nsamples_dev,
                             nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'contexts':
                (nsamples_train + nsamples_dev + nsamples_test, totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Example #12
def build_mentions_ds(config,
                      all_contexts,
                      nsamples_train,
                      nsamples_dev,
                      nsamples_test,
                      nsamples_dev_big,
                      max_len_men=4):
    logger.info('building mentions (indices of mention words) dataset')
    totals = len(all_contexts)
    dsdir = config['dsdir']
    mentions_m = numpy.ones(shape=(totals, max_len_men), dtype='int32')
    for i, ctx in enumerate(all_contexts):
        mentions_m[i] = ctx.mention
    with h5py.File(dsdir + "_mentions.hdf", mode='w') as fp:
        mentions = fp.create_dataset('mentions',
                                     mentions_m.shape,
                                     dtype='int32')
        mentions[...] = mentions_m
        split_dict = {
            'train': {
                'mentions': (0, nsamples_train)
            },
            'dev': {
                'mentions': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'mentions': (nsamples_train + nsamples_dev,
                             nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'mentions':
                (nsamples_train + nsamples_dev + nsamples_test, totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Example #13
def get_dev_stream(path, **kwargs):
    """Setup development set stream if necessary."""

    sources = ('words', 'audio', 'words_ends', 'punctuation_marks', 'phones', 'phones_words_ends', 'phones_words_acoustic_ends', 'text', 'uttids')
    #sources = ('words', 'audio', 'words_ends', 'punctuation_marks', 'phones', 'phones_words_ends', 'text', 'uttids')
    dataset = H5PYDataset(path, which_sets=('dev',), sources=sources)
    return dataset.get_example_stream()
Example #14
def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    n_labeled = n_train + n_valid
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images',
                             shape=(n_total, ),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets',
                             shape=(n_labeled, 1),
                             dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')
Example #15
def build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile=None, max_len_name=30):
    char_to_idx, idx_to_char = build_char_vocab(trnMentions) #train for characters because we only use entities names for characters
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_letters = numpy.zeros(shape=(totals, max_len_name), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        input_letters[i] = get_ngram_seq(char_to_idx, name, max_len_name)
    print input_letters.shape
    fuelfile = dsdir +'_letters.h5py'
    f = h5py.File(fuelfile, mode='w')
    features = f.create_dataset('letters', input_letters.shape, dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(char_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_char, default_flow_style=False)
    features.attrs['vocabsize'] = len(char_to_idx)
    features[...] = input_letters
    features.dims[0].label = 'letters'
    nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
    split_dict = {
        'train': {'letters': (0, nsamples_train)},
        'dev': {'letters': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'letters': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('building letters dataset finished. It saved in: %s', fuelfile)
    if vectorfile is None: 
        return
    embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=char_to_idx, num=-1)
    logger.info('size of embedding matrix to save is: (%d, %d)', embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_letters_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
Example #16
File: camvid.py Project: bordesf/fuel
def _initialize_conversion(directory, output_path, image_shape):
    h5file = h5py.File(output_path, mode='w')
    split_dict = {
        'train': {
            'features': (0, TRAIN_STOP),
            'targets': (0, TRAIN_STOP)
        },
        'valid': {
            'features': (TRAIN_STOP, VALID_STOP),
            'targets': (TRAIN_STOP, VALID_STOP)
        },
        'test': {
            'features': (VALID_STOP, NUM_EXAMPLES),
            'targets': (VALID_STOP, NUM_EXAMPLES)
        }
    }
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    targets_dataset = h5file.create_dataset('targets',
                                            (NUM_EXAMPLES, ) + image_shape,
                                            dtype='uint8')
    targets_dataset.dims[0].label = 'batch'
    targets_dataset.dims[1].label = 'height'
    targets_dataset.dims[2].label = 'width'

    features_dataset = h5file.create_dataset('features',
                                             (NUM_EXAMPLES, 3) + image_shape,
                                             dtype='uint8')
    features_dataset.dims[0].label = 'batch'
    features_dataset.dims[1].label = 'channel'
    features_dataset.dims[2].label = 'height'
    features_dataset.dims[3].label = 'width'

    return h5file
Example #17
def test_celeba():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('celeba_64.hdf5', 'w')
        f['features'] = numpy.arange(10 * 3 * 64 * 64, dtype='uint8').reshape(
            (10, 3, 64, 64))
        f['targets'] = numpy.arange(10 * 40, dtype='uint8').reshape((10, 40))
        split_dict = {
            'train': {
                'features': (0, 6),
                'targets': (0, 6)
            },
            'valid': {
                'features': (6, 8),
                'targets': (6, 8)
            },
            'test': {
                'features': (8, 10),
                'targets': (8, 10)
            }
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = CelebA(which_format='64', which_sets=('train', ))
        assert_equal(dataset.filename, 'celeba_64.hdf5')
    finally:
        config.data_path = data_path
        os.remove('celeba_64.hdf5')
Example #18
def build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, use_lowercase=False, max_num_words=10, upto=None):
    if vectorfile is None:
        return
    word_to_idx, idx_to_word = build_word_vocab(trnMentions+devMentions+tstMentions) #train for characters because we only use entities names for characters
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_words = numpy.zeros(shape=(totals, max_num_words), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        input_words[i] = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
    logger.info('shape of subwords dataset: %s', input_words.shape)
    hdf5_file = dsdir + '_subwords.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('subwords', input_words.shape, dtype='int32')  # @UndefinedVariable
    
    features.attrs['voc2idx'] = yaml.dump(word_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_words
    features.dims[0].label = 'words'
    nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
    split_dict = {
        'train': {'subwords': (0, nsamples_train)},
        'dev': {'subwords': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'subwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush();f.close()
    logger.info('Building subwords dataset finished. It saved in: %s', hdf5_file)
    logger.info('writing subword embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    with h5py.File(dsdir + "_subwords_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
Example #19
def build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx, hdf5_file, embpath, emb_list, vectorsize=200, upto=-1):
    print "building hs Ngram datasets: ", emb_list
    for emb_version in emb_list:
        print emb_version
        mypath = os.path.join(embpath, emb_version)
        nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
        totals = nsamples_train + nsamples_dev + len(tstMentions) 
        vectorsize = get_vec_size(mypath+'/train.txt')
        input_hsngram_matrix = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
        input_hsngram_matrix[0:nsamples_train] = load_embmatirx(mypath+'/train.txt', len(trnMentions), vectorsize, upto)
        input_hsngram_matrix[nsamples_train:nsamples_train+nsamples_dev] = load_embmatirx(mypath+'/dev.txt', len(devMentions), vectorsize, upto)
        input_hsngram_matrix[nsamples_train+nsamples_dev:totals] = load_embmatirx(mypath+'/test.txt', len(tstMentions), vectorsize, upto)
        print input_hsngram_matrix.shape
        srcname = 'hsngram_' + emb_version
        hdf5_file = hdf5_file + '_'+ srcname + '.h5py'
        print hdf5_file
        f = h5py.File(hdf5_file, mode='w')
        features = f.create_dataset(srcname, input_hsngram_matrix.shape, dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = vectorsize
        features[...] = input_hsngram_matrix
        features.dims[0].label = srcname + '_vector'
        split_dict = {
            'train': {srcname: (0, nsamples_train)},
            'dev': {srcname: (nsamples_train, nsamples_train + nsamples_dev)}, 
            'test': {srcname: (nsamples_train + nsamples_dev, totals)}}    
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
        logger.info('Building hinrich ngram-level embeddings of mentions finished. It saved in: %s', hdf5_file)
Example #20
def build_entmentions_ds(config, all_contexts, nsamples_train, nsamples_dev,
                         nsamples_test, nsamples_dev_big):
    logger.info('building entmentions dataset')
    totals = len(all_contexts)
    ctx_dtype = h5py.special_dtype(vlen=np.dtype('uint32'))
    dsdir = config['dsdir']
    ctx_entity_dtype = np.dtype([("id", np.dtype(str), 64),
                                 ("token", np.dtype(str), 64),
                                 ("position", np.dtype('uint8'))])
    with h5py.File(dsdir + "_entmentions.hdf", mode='w') as fp:
        context_entities = fp.create_dataset(
            'entmentions',
            compression='gzip',
            data=np.asarray([(ctx.entity_id, ctx.entity_str, ctx.entity_idx)
                             for ctx in all_contexts],
                            dtype=ctx_entity_dtype))
        split_dict = {
            'train': {
                'entmentions': (0, nsamples_train)
            },
            'dev': {
                'entmentions': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'entmentions': (nsamples_train + nsamples_dev,
                                nsamples_train + nsamples_dev + nsamples_test)
            },
            'devbig': {
                'entmentions':
                (nsamples_train + nsamples_dev + nsamples_test, totals)
            }
        }
        fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Example #21
def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file, vectorfile, upto=-1):
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in voc2idx:
            entvec = embeddings[voc2idx[mye]]
        input_entvec[i] = entvec
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx) # a matrix with size: 102 * dim
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)
    
    hdf5_file += '_tc.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tc', ent_types_cosin_matrix.shape, dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
    features[...] = ent_types_cosin_matrix
    features.dims[0].label = 'types_ent_cosine'
    nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
    split_dict = {
        'train': {'tc': (0, nsamples_train)},
        'dev': {'tc': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'tc': (nsamples_train + nsamples_dev, totals)}}    
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-ent cosine (tc) dataset finished. It saved in: %s', hdf5_file)
Example #22
def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    hdf5_file = dsdir + '_targets.h5py'
    f = h5py.File(hdf5_file, mode='w')
    targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {
            'targets': (0, nsamples_train)
        },
        'dev': {
            'targets': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'targets': (nsamples_train + nsamples_dev, totals)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Example #23
def load_unlabeled_indices_from_hdf5(hdf5_file):
    data = H5PYDataset(hdf5_file,
                       which_sets=("data", ),
                       sources=("unlabeled_indices", ),
                       load_in_memory=True)
    unlabeled_indices, = data.data_sources
    return unlabeled_indices.astype("int")
Example #24
def build_targets_ds(config, all_contexts, nsamples_train, nsamples_dev,
                     nsamples_test, nsamples_dev_big):
    logger.info("building targets dataset")
    entity_types = list(load_types(config['typefile']))
    (t2idx, _) = cmn.loadtypes(config['typefile'])
    totals = len(all_contexts)
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, ctx in enumerate(all_contexts):
        types_idx = [t2idx[t] for t in ctx.all_types if t in t2idx]
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    dsdir = config['dsdir']
    fp = h5py.File(dsdir + '_targets.hdf', mode='w')
    targets = fp.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    split_dict = {
        'train': {
            'targets': (0, nsamples_train)
        },
        'dev': {
            'targets': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'targets': (nsamples_train + nsamples_dev,
                        nsamples_train + nsamples_dev + nsamples_test)
        },
        'devbig': {
            'targets': (nsamples_train + nsamples_dev + nsamples_test, totals)
        }
    }
    fp.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    fp.flush()
    fp.close()
Example #25
 def part(self, part):
     if part not in self._dataset_cache:
         self._dataset_cache[part] = H5PYDataset(h5py.File(
             self._part_paths[part], "r"),
                                                 which_sets=('all', ),
                                                 sources=self._sources,
                                                 load_in_memory=True)
     return self._dataset_cache[part]
Example #26
def build_hsNgram_ds(config,
                     trnMentions,
                     devMentions,
                     tstMentions,
                     t2idx,
                     hdf5_file,
                     embpath,
                     emb_list,
                     vectorsize=200,
                     upto=-1):
    print "building hs Ngram datasets: ", emb_list
    for emb_version in emb_list:
        print emb_version
        mypath = os.path.join(embpath, emb_version)
        nsamples_train = len(trnMentions)
        nsamples_dev = len(devMentions)
        totals = nsamples_train + nsamples_dev + len(tstMentions)
        vectorsize = get_vec_size(mypath + '/train.txt')
        input_hsngram_matrix = numpy.zeros(shape=(totals, vectorsize),
                                           dtype='float32')
        input_hsngram_matrix[0:nsamples_train] = load_embmatirx(
            mypath + '/train.txt', len(trnMentions), vectorsize, upto)
        input_hsngram_matrix[nsamples_train:nsamples_train +
                             nsamples_dev] = load_embmatirx(
                                 mypath + '/dev.txt', len(devMentions),
                                 vectorsize, upto)
        input_hsngram_matrix[nsamples_train +
                             nsamples_dev:totals] = load_embmatirx(
                                 mypath + '/test.txt', len(tstMentions),
                                 vectorsize, upto)
        print input_hsngram_matrix.shape
        srcname = 'hsngram_' + emb_version
        hdf5_file = hdf5_file + '_' + srcname + '.h5py'
        print hdf5_file
        f = h5py.File(hdf5_file, mode='w')
        features = f.create_dataset(srcname,
                                    input_hsngram_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = vectorsize
        features[...] = input_hsngram_matrix
        features.dims[0].label = srcname + '_vector'
        split_dict = {
            'train': {
                srcname: (0, nsamples_train)
            },
            'dev': {
                srcname: (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                srcname: (nsamples_train + nsamples_dev, totals)
            }
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
        f.close()
        logger.info(
            'Building hinrich ngram-level embeddings of mentions finished. It saved in: %s',
            hdf5_file)
Example #27
def fill_hdf5_file(h5file, data):
    """Fills an HDF5 file in a H5PYDataset-compatible manner.

    Parameters
    ----------
    h5file : :class:`h5py.File`
        File handle for an HDF5 file.
    data : tuple of tuple
        One element per split/source pair. Each element consists of a
        tuple of (split_name, source_name, data_array, comment), where

        * 'split_name' is a string identifier for the split name
        * 'source_name' is a string identifier for the source name
        * 'data_array' is a :class:`numpy.ndarray` containing the data
          for this split/source pair
        * 'comment' is a comment string for the split/source pair

        The 'comment' element can optionally be omitted.

    """
    # Check that all sources for a split have the same length
    split_names = set(split_tuple[0] for split_tuple in data)
    for name in split_names:
        lengths = [
            len(split_tuple[2]) for split_tuple in data
            if split_tuple[0] == name
        ]
        if not all(le == lengths[0] for le in lengths):
            raise ValueError("split '{}' has sources that ".format(name) +
                             "vary in length")

    # Initialize split dictionary
    split_dict = dict([(split_name, {}) for split_name in split_names])

    # Compute total source lengths and check that splits have the same dtype
    # across a source
    source_names = set(split_tuple[1] for split_tuple in data)
    for name in source_names:
        splits = [s for s in data if s[1] == name]
        indices = numpy.cumsum([0] + [len(s[2]) for s in splits])
        if not all(s[2].dtype == splits[0][2].dtype for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in dtype")
        if not all(s[2].shape[1:] == splits[0][2].shape[1:] for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in shapes")
        dataset = h5file.create_dataset(
            name, (sum(len(s[2]) for s in splits), ) + splits[0][2].shape[1:],
            dtype=splits[0][2].dtype)
        dataset[...] = numpy.concatenate([s[2] for s in splits], axis=0)
        for i, j, s in zip(indices[:-1], indices[1:], splits):
            if len(s) == 4:
                split_dict[s[0]][name] = (i, j, None, s[3])
            else:
                split_dict[s[0]][name] = (i, j)
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
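A minimal call sketch for the function above, using toy arrays; the optional fourth 'comment' element of each tuple is omitted:

import h5py
import numpy

train_features = numpy.random.rand(8, 3).astype('float32')
test_features = numpy.random.rand(2, 3).astype('float32')
train_targets = numpy.arange(8, dtype='int64').reshape((8, 1))
test_targets = numpy.arange(2, dtype='int64').reshape((2, 1))

with h5py.File('toy.hdf5', 'w') as h5file:
    fill_hdf5_file(h5file, (
        ('train', 'features', train_features),
        ('train', 'targets', train_targets),
        ('test', 'features', test_features),
        ('test', 'targets', test_targets)))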
Example #28
def build_letters_ds(trnMentions,
                     devMentions,
                     tstMentions,
                     t2idx,
                     dsdir,
                     vectorfile=None,
                     max_len_name=30):
    char_to_idx, idx_to_char = build_char_vocab(
        trnMentions
    )  #train for characters because we only use entities names for characters
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_letters = numpy.zeros(shape=(totals, max_len_name), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        input_letters[i] = get_ngram_seq(char_to_idx, name, max_len_name)
    print input_letters.shape
    fuelfile = dsdir + '_letters.h5py'
    f = h5py.File(fuelfile, mode='w')
    features = f.create_dataset('letters', input_letters.shape,
                                dtype='int32')  # @UndefinedVariable
    features.attrs['voc2idx'] = yaml.dump(char_to_idx,
                                          default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_char,
                                          default_flow_style=False)
    features.attrs['vocabsize'] = len(char_to_idx)
    features[...] = input_letters
    features.dims[0].label = 'letters'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {
            'letters': (0, nsamples_train)
        },
        'dev': {
            'letters': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'letters': (nsamples_train + nsamples_dev, totals)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('building letters dataset finished. It saved in: %s', fuelfile)
    if vectorfile is None:
        return
    embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                   vocab=char_to_idx,
                                                   num=-1)
    logger.info('size of embedding matrix to save is: (%d, %d)',
                embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_letters_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors',
                                    compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
Example #29
 def __init__(self, split='train', **kwargs):
     path = find_in_data_path(self._filename)
     self.split = split
     self.train = H5PYDataset(file_or_path=path, which_sets=['train'])
     self.train_labels = H5PYDataset(
         file_or_path=path,
         which_sets=['train'],
         sources=['targets'],
         load_in_memory=True).data_sources[0].ravel()
     self.test = H5PYDataset(file_or_path=path, which_sets=['test'])
     self.test_labels = H5PYDataset(
         file_or_path=path,
         which_sets=['test'],
         sources=['targets'],
         load_in_memory=True).data_sources[0].ravel()
     self.train_handle = self.train.open()
     self.test_handle = self.test.open()
     self.ntest = self.test.num_examples
     self.ntrain = self.train.num_examples
Example #30
    def save_decision_boundary(self, w, b):
        """
        :return:
        """
        w = to_vector(w).T
        if os.path.isfile(self.save_path_boundaries):
            with h5py.File(self.save_path_boundaries, 'r+') as hf:
                w_dataset = hf.get('w')
                already_in_w_ds = w_dataset.shape[0]
                w_dataset.resize(already_in_w_ds + w.shape[0], axis=0)
                w_dataset[already_in_w_ds:already_in_w_ds + w.shape[0], :] = w

                b_dataset = hf.get('b')
                already_in_b_ds = b_dataset.len()
                b_dataset.resize(already_in_b_ds + 1, axis=0)
                b_dataset[already_in_b_ds:already_in_b_ds + 1] = b

                split_dict = {
                    "data": {
                        "w": (0, already_in_w_ds + w.shape[0]),
                        "b": (0, already_in_b_ds + 1)
                    }
                }
                hf.attrs["split"] = H5PYDataset.create_split_array(split_dict)
        else:
            # HDF5 decision boundary save file does not exist yet; create it.
            f = h5py.File(self.save_path_boundaries, "w")

            w_dataset = f.create_dataset('w',
                                         w.shape,
                                         maxshape=(None, w.shape[1]),
                                         dtype="float32")
            w_dataset[...] = w

            b_dataset = f.create_dataset('b', (1, ),
                                         maxshape=(None, ),
                                         dtype="float32")
            b_dataset[...] = b

            split_dict = {"data": {"w": (0, w.shape[0]), "b": (0, 1)}}
            f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
            f.flush()
            f.close()
Example #31
def fill_hdf5_file(h5file, data):
    """Fills an HDF5 file in a H5PYDataset-compatible manner.

    Parameters
    ----------
    h5file : :class:`h5py.File`
        File handle for an HDF5 file.
    data : tuple of tuple
        One element per split/source pair. Each element consists of a
        tuple of (split_name, source_name, data_array, comment), where

        * 'split_name' is a string identifier for the split name
        * 'source_name' is a string identifier for the source name
        * 'data_array' is a :class:`numpy.ndarray` containing the data
          for this split/source pair
        * 'comment' is a comment string for the split/source pair

        The 'comment' element can optionally be omitted.

    """
    # Check that all sources for a split have the same length
    split_names = set(split_tuple[0] for split_tuple in data)
    for name in split_names:
        lengths = [len(split_tuple[2]) for split_tuple in data
                   if split_tuple[0] == name]
        if not all(l == lengths[0] for l in lengths):
            raise ValueError("split '{}' has sources that ".format(name) +
                             "vary in length")

    # Initialize split dictionary
    split_dict = dict([(split_name, {}) for split_name in split_names])

    # Compute total source lengths and check that splits have the same dtype
    # across a source
    source_names = set(split_tuple[1] for split_tuple in data)
    for name in source_names:
        splits = [s for s in data if s[1] == name]
        indices = numpy.cumsum([0] + [len(s[2]) for s in splits])
        if not all(s[2].dtype == splits[0][2].dtype for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in dtype")
        if not all(s[2].shape[1:] == splits[0][2].shape[1:] for s in splits):
            raise ValueError("source '{}' has splits that ".format(name) +
                             "vary in shapes")
        dataset = h5file.create_dataset(
            name, (sum(len(s[2]) for s in splits),) + splits[0][2].shape[1:],
            dtype=splits[0][2].dtype)
        dataset[...] = numpy.concatenate([s[2] for s in splits], axis=0)
        for i, j, s in zip(indices[:-1], indices[1:], splits):
            if len(s) == 4:
                split_dict[s[0]][name] = (i, j, None, s[3])
            else:
                split_dict[s[0]][name] = (i, j)
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
Example #32
def make_fuel_dataset(file_name, training_set_data, testing_set_data):
    """

    :param file_name:
    :type file_name:
    :param training_set_data:
    :type training_set_data: list[dict[str, numpy.core.multiarray.ndarray]]
    :param testing_set_data:
    :type testing_set_data: list[dict[str, numpy.core.multiarray.ndarray]]
    :return:
    :rtype:
    """
    f = h5py.File(file_name, mode='w')

    nb_training_examples = len(training_set_data)
    nb_testing_examples = len(testing_set_data)
    nb_examples = nb_training_examples + nb_testing_examples

    ts = [
          ('audio_features', 'audio_features', 'float32'),
          ('targets_weak_alarm', 'classes_alarm_weak', 'uint8'),
          ('targets_weak_vehicle', 'classes_vehicle_weak', 'uint8'),
          ('targets_strong_alarm', 'classes_alarm_strong', 'uint8'),
          ('targets_strong_vehicle', 'classes_vehicle_strong', 'uint8')
          ]

    shape_labels = ['batch'.encode('utf8'), 'time_frames'.encode('utf8'), 'features'.encode('utf8')]
    datasets = []
    datasets_shapes = []
    datasets_shapes_labels = []
    for t in ts:
        datasets.append(f.create_dataset(t[0], (nb_examples, ), dtype=h5py.special_dtype(vlen=np.dtype(t[2]))))
        datasets[-1][...] = [entry[t[1]].flatten() for entry in training_set_data + testing_set_data]

        datasets_shapes.append(f.create_dataset('{}_shapes'.format(t[0]), (nb_examples, 3), dtype='int32'))
        datasets_shapes[-1][...] = np.array([entry[t[1]].reshape((1, ) + entry[t[1]].shape).shape
                                             for entry in training_set_data + testing_set_data])
        datasets[-1].dims.create_scale(datasets_shapes[-1], 'shapes')
        datasets[-1].dims[0].attach_scale(datasets_shapes[-1])

        datasets_shapes_labels.append(f.create_dataset('{}_shape_labels'.format(t[0]), (3, ), dtype='S11'))
        datasets_shapes_labels[-1][...] = shape_labels
        datasets[-1].dims.create_scale(datasets_shapes_labels[-1], 'shape_labels')
        datasets[-1].dims[0].attach_scale(datasets_shapes_labels[-1])

    split_dict = {
        'train': {t[0]: (0, nb_training_examples) for t in ts},
        'test': {t[0]: (nb_training_examples, nb_examples) for t in ts}
    }

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
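The '<name>_shapes' datasets attached above as a 'shapes' dimension scale are what lets Fuel restore each flattened variable-length example to its (1, time_frames, features) shape on read. A read-back sketch with a hypothetical file name:

train = H5PYDataset('dataset.hdf5',
                    which_sets=('train', ),
                    sources=('audio_features', ))
handle = train.open()
first_example, = train.get_data(handle, slice(0, 1))
train.close(handle)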
Example #33
 def __init__(self, h5_file, for_training=True):
     """
     Args:
         h5_file (string): Path to the h5 file
         for_training (bool): True for the training split, otherwise the 'valid' split is used
     """
     super(MujocoTraintestPusherDataset, self).__init__()
     self.f = h5py.File(h5_file, "r")
     phase = "train"
     if not for_training:
         phase = "valid"
     self.f = H5PYDataset(h5_file, which_sets=(phase, ))
Example #34
File: base.py Project: yccai/scikit-chem
    def save_splits(self):
        """ Save the splits to the data file. """

        logger.info('Producing dataset splits...')
        for split in self.splits:
            split.save()
        split_dict = {split.name: split.to_dict() for split in self.splits}
        splits = H5PYDataset.create_split_array(split_dict)
        logger.debug('split: %s', splits)
        logger.info('Saving splits...')
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.data_file.attrs['split'] = splits
Example #35
def get_stream(hdf5_file, which_set, batch_size=None):
    dataset = H5PYDataset(hdf5_file,
                          which_sets=(which_set, ),
                          load_in_memory=True)
    if batch_size is None:
        batch_size = dataset.num_examples
    stream = DataStream(dataset=dataset,
                        iteration_scheme=ShuffledScheme(
                            examples=dataset.num_examples,
                            batch_size=batch_size))
    # Required because Recurrent bricks receive as input [sequence, batch,
    # features]
    return Mapping(stream, transpose_stream)
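transpose_stream is referenced above but not defined in this snippet; a plausible sketch (an assumption, not the original implementation) that swaps the batch and sequence axes of every multi-dimensional source in a batch:

def transpose_stream(data):
    # Turn (batch, sequence, ...) arrays into (sequence, batch, ...); leave
    # one-dimensional sources untouched.
    return tuple(array.swapaxes(0, 1) if array.ndim > 1 else array
                 for array in data)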
Example #36
def get_all_data_inorder(filename, batch_size):
    sources = ('features', 'targets')

    dataset_fname = find_in_data_path(filename + '.hdf5')
    data_all = H5PYDataset(dataset_fname,
                           which_sets=['train', 'valid', 'test'],
                           sources=sources)
    data_all.default_transformers = uint8_pixels_to_floatX(('features', ))
    main_stream = DataStream.default_stream(dataset=data_all,
                                            iteration_scheme=SequentialScheme(
                                                data_all.num_examples,
                                                batch_size))
    color_stream = Colorize(main_stream, which_sources=('features', ))
    return data_all.num_examples, color_stream
Example #37
def build_typecosine_ds(trnMentions,
                        devMentions,
                        tstMentions,
                        t2idx,
                        hdf5_file,
                        vectorfile,
                        upto=-1):
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in voc2idx:
            entvec = embeddings[voc2idx[mye]]
        input_entvec[i] = entvec
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize,
        voc2idx)  # a matrix with size: 102 * dim
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)

    hdf5_file += '_tc.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tc',
                                ent_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
    features[...] = ent_types_cosin_matrix
    features.dims[0].label = 'types_ent_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {
            'tc': (0, nsamples_train)
        },
        'dev': {
            'tc': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'tc': (nsamples_train + nsamples_dev, totals)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info(
        'Building types-ent cosine (tc) dataset finished. It saved in: %s',
        hdf5_file)
Example #38
def test_mem_loader(nevt=1000, load_in_memory=True):
    fname = 'nukecc_fuel_me1B.hdf5'

    train_set = H5PYDataset(fname, which_sets=('train', ), sources=['hits'])
    nexamp = train_set.num_examples

    if nevt > nexamp:
        nevt = nexamp

        train_set = H5PYDataset(fname,
                                subset=slice(0, nevt),
                                which_sets=('train', ),
                                sources=['hits'],
                                load_in_memory=load_in_memory)
        handle = train_set.open()
        data = train_set.get_data(handle, slice(0, nevt))
        length = np.shape(data[0])[0]
        if length != nevt:
            raise ValueError(
                'expected {} examples but got {}'.format(nevt, length))
        counter = 0
        for _ in range(length):
            counter += data[0][0]

        train_set.close(handle)
Example #39
def make_lr_fuel_file(outfile, inda, indb, indc, X, y):
    """
    Makes a FUEL dataset that combines both left and right features.
    :param outfile:
    :param inda:
    :param indb:
    :param indc:
    :param X:
    :param y:
    :return:
    """
    # Make the pytables table:
    f = h5py.File(outfile, mode='w')
    targets = f.create_dataset('targets', y.shape, dtype='int8')
    l_features = f.create_dataset('l_features', X['l'].shape, dtype='int8')
    r_features = f.create_dataset('r_features', X['r'].shape, dtype='int8')

    # Load the data into it:
    l_features[...] = X['l']
    r_features[...] = X['r']
    targets[...] = y

    # Label the axis:
    targets.dims[0].label = 'sample'
    targets.dims[1].label = 'class'
    l_features.dims[0].label = 'sample'
    l_features.dims[1].label = 'feature'
    r_features.dims[0].label = 'sample'
    r_features.dims[1].label = 'feature'

    # Make a "splits" dictionary as required by Fuel
    split_dict = {
        'train': {'l_features': (0, inda),
                  'r_features': (0, inda),
                  'targets': (0, inda)},
        'valid': {'l_features': (inda, inda + indb),
                  'r_features': (inda, inda + indb),
                  'targets': (inda, inda + indb)},
        'test': {'l_features': (inda + indb, inda + indb + indc),
                 'r_features': (inda + indb, inda + indb + indc),
                 'targets': (inda + indb, inda + indb + indc)},
    }

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    # Save this new dataset to file
    f.flush()
    f.close()
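A hypothetical call sketch for the function above: X_left, X_right and y_onehot are assumed int8 arrays with the same number of rows, and inda, indb and indc are the sizes of the train, valid and test portions laid out back to back:

make_lr_fuel_file('lr_dataset.hdf5',
                  inda=800, indb=100, indc=100,
                  X={'l': X_left, 'r': X_right},
                  y=y_onehot)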
Example #40
File: test_svhn.py Project: Afrik/fuel
def test_svhn():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('svhn_format_2.hdf5', 'w')
        f['features'] = numpy.arange(100, dtype='uint8').reshape((10, 10))
        f['targets'] = numpy.arange(10, dtype='uint8').reshape((10, 1))
        split_dict = {'train': {'features': (0, 8), 'targets': (0, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = SVHN(which_format=2, which_sets=('train',))
        assert_equal(dataset.filename, 'svhn_format_2.hdf5')
    finally:
        config.data_path = data_path
        os.remove('svhn_format_2.hdf5')
Example #41
def build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path, t2idx, dsdir, vectorfile, use_lowercase=True, upto=None):
    if ent2tfidf_features_path is None:
        print "Warning: ignoring tfidf features building..."
        return
    ent2features = load_ent2features(ent2tfidf_features_path)
    word_to_idx, idx_to_word = build_voc_from_features(ent2features)
    logger.info('tfidf desc features vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_features = numpy.zeros(shape=(totals, len(ent2features.values()[0])), dtype='int32')
    ent_no_emb = 0
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        if men.entityId not in ent2features:
            ent_no_emb += 1
            continue
        features = ent2features[men.entityId]
        input_features[i] = get_ngram_seq(word_to_idx, features, max_len=input_features.shape[1])
    logger.info('shape of tfidf input dataset: %s', input_features.shape)
    logger.info('number of entities without embeddings: %d', ent_no_emb)
    hdf5_file = dsdir + '_desc_features.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('desc_features', input_features.shape, dtype='int32')  # @UndefinedVariable
    
    features.attrs['voc2idx'] = yaml.dump(word_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_features
    features.dims[0].label = 'description_features'
    nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
    split_dict = {
        'train': {'desc_features': (0, nsamples_train)},
        'dev': {'desc_features': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'desc_features': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush();f.close()
    
    logger.info('Building desc_features dataset finished. It saved in: %s', hdf5_file)
    logger.info('writing word embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, use_lowercase=use_lowercase, num=upto)
    print "embeddings shape: ", idx2embeddings.shape
    with h5py.File(dsdir + "_desc_features_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
Example #42
File: test_celeba.py Project: Afrik/fuel
def test_celeba():
    data_path = config.data_path
    try:
        config.data_path = '.'
        f = h5py.File('celeba_64.hdf5', 'w')
        f['features'] = numpy.arange(
            10 * 3 * 64 * 64, dtype='uint8').reshape((10, 3, 64, 64))
        f['targets'] = numpy.arange(
            10 * 40, dtype='uint8').reshape((10, 40))
        split_dict = {'train': {'features': (0, 6), 'targets': (0, 6)},
                      'valid': {'features': (6, 8), 'targets': (6, 8)},
                      'test': {'features': (8, 10), 'targets': (8, 10)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.close()
        dataset = CelebA(which_format='64', which_sets=('train',))
        assert_equal(dataset.filename, 'celeba_64.hdf5')
    finally:
        config.data_path = data_path
        os.remove('celeba_64.hdf5')
Example #43
def make_one_sided_fuel_file(outfile, inda, indb, indc, X, y, side):
    """
    Makes a dataset that includes only a single side of features.
    :param outfile:
    :param inda:
    :param indb:
    :param indc:
    :param X:
    :param y:
    :param side:
    :return:
    """
    # Make the pytables table:
    f = h5py.File(outfile, mode='w')
    targets = f.create_dataset('targets', y.shape, dtype='int8')
    features = f.create_dataset('{}_features'.format(side), X.shape, dtype='int8')

    # Load the data into it:
    features[...] = X
    targets[...] = y

    # Label the axis:
    targets.dims[0].label = 'sample'
    targets.dims[1].label = 'class'
    features.dims[0].label = 'sample'
    features.dims[1].label = 'feature'

    # Make a "splits" dictionary as required by Fuel
    split_dict = {
        'train': {'{}_features'.format(side): (0, inda),
                  'targets': (0, inda)},
        'valid': {'{}_features'.format(side): (inda, inda + indb),
                  'targets': (inda, inda + indb)},
        'test': {'{}_features'.format(side): (inda + indb, inda + indb + indc),
                 'targets': (inda + indb, inda + indb + indc)},
    }

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    # Save this new dataset to file
    f.flush()
    f.close()
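
A hedged usage example for the function above; the array shapes, file name and the choice side='left' are illustrative, and inda/indb/indc are the train/valid/test row counts, so they must sum to the number of rows in X and y.

import numpy
from fuel.datasets.hdf5 import H5PYDataset

X = numpy.random.randint(0, 2, size=(10, 5)).astype('int8')
y = numpy.random.randint(0, 2, size=(10, 3)).astype('int8')
make_one_sided_fuel_file('one_sided.hdf5', 6, 2, 2, X, y, side='left')

# Read the 'valid' rows back (the source name follows the side argument).
valid_set = H5PYDataset('one_sided.hdf5', which_sets=('valid',),
                        sources=('left_features', 'targets'),
                        load_in_memory=True)
left_features, targets = valid_set.data_sources
print(left_features.shape)  # expected: (2, 5)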
Example #44
def build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir):
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    targets_m = numpy.zeros(shape=(totals, len(t2idx)), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        types_idx = [t2idx[t] for t in men.alltypes] 
        targets_m[i] = cmn.convertTargetsToBinVec(types_idx, len(t2idx))
    hdf5_file = dsdir + '_targets.h5py'
    f = h5py.File(hdf5_file, mode='w')
    targets = f.create_dataset('targets', targets_m.shape, dtype='int32')
    targets.attrs['type_to_ix'] = yaml.dump(t2idx)
    targets[...] = targets_m
    targets.dims[0].label = 'all_types'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'targets': (0, nsamples_train)},
        'dev': {'targets': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'targets': (nsamples_train + nsamples_dev, totals)}}    
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
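
cmn.convertTargetsToBinVec is external to this snippet; a plausible equivalent, assumed from how it is called above, is a multi-hot encoding of the type indices.

import numpy

def convert_targets_to_bin_vec(type_indices, num_types):
    # Assumed equivalent of cmn.convertTargetsToBinVec: a vector of length
    # num_types with a 1 at every index in type_indices.
    vec = numpy.zeros(num_types, dtype='int32')
    vec[list(type_indices)] = 1
    return vec

# convert_targets_to_bin_vec([0, 3], 5) -> array([1, 0, 0, 1, 0], dtype=int32)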
Example #45
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, upto=-1, max_num_words=4):
    word_to_idx, idx_to_word = build_word_vocab(trnMentions+devMentions+tstMentions) #train for characters because we only use entities names for characters
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, num=upto)
    
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec
    
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx) # a matrix with size: 102 * dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)
     
    dsdir += '_tcwords.h5py'
    f = h5py.File(dsdir, mode='w')
    features = f.create_dataset('tcwords', words_types_cosin_matrix.shape, dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'tcwords': (0, nsamples_train)},
        'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}    
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-words cosine (tcwords) dataset finished. Saved to: %s', dsdir)
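
buildcosinematrix is also external; a plausible numpy equivalent, assumed from its use above, computes pairwise cosine similarities between the mention vectors and the type vectors.

import numpy

def build_cosine_matrix(A, B, eps=1e-8):
    # Assumed equivalent of buildcosinematrix: cosine similarity between each
    # row of A (n_samples, dim) and each row of B (n_types, dim).
    A_norm = A / (numpy.linalg.norm(A, axis=1, keepdims=True) + eps)
    B_norm = B / (numpy.linalg.norm(B, axis=1, keepdims=True) + eps)
    return A_norm.dot(B_norm.T).astype('float32')  # shape: (n_samples, n_types)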
Example #46
def build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, ngram, max_num_ngrams=98, upto=-1):
    ngram_to_idx, idx_to_word, name2ngrams = build_ngram_vocab(trnMentions+devMentions+tstMentions,ngram=ngram, MIN_FREQ=5) #train for characters because we only use entities names for characters
    logger.info('ngram%d vocab size: %d', ngram, len(ngram_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_words = numpy.zeros(shape=(totals, max_num_ngrams), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        ngrams = name2ngrams[name]
        input_words[i] = get_ngram_seq(ngram_to_idx, ngrams, max_len=max_num_ngrams)
    print input_words.shape
    ngram_label = 'ngrams' + str(ngram)
    hdf5_file = dsdir + '_ngrams'+str(ngram)+'.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset(ngram_label, input_words.shape, dtype='int32')  # @UndefinedVariable
    
    features.attrs['voc2idx'] = yaml.dump(ngram_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(ngram_to_idx)
    features[...] = input_words
    features.dims[0].label = ngram_label
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {ngram_label: (0, nsamples_train)},
        'dev': {ngram_label: (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {ngram_label: (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building ngram%d dataset finished. Saved to: %s', ngram, hdf5_file)
    if vectorfile is None or vectorfile == '': 
        return
    logger.info('Now, writing ngram embeddings')
    embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=ngram_to_idx, num=upto)
    logger.info('size of embedding matrix to save is: (%d, %d)', embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + "_" + ngram_label + "_embeddings.h5py", mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip',
                                    data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize        
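
get_ngram_seq, used by several builders above, is not shown in this listing; a plausible sketch of its behaviour, assumed from the call sites, maps tokens to vocabulary indices and pads or truncates the result to max_len.

def get_ngram_seq_sketch(voc2idx, tokens, max_len, unk_idx=0, pad_idx=0):
    # Assumed behaviour: look up each token, truncate to max_len,
    # right-pad shorter sequences with pad_idx.
    ids = [voc2idx.get(tok, unk_idx) for tok in tokens][:max_len]
    return ids + [pad_idx] * (max_len - len(ids))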
Example #47
def convert_svhn_format_1(directory, output_directory,
                          output_filename='svhn_format_1.hdf5'):
    """Converts the SVHN dataset (format 1) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}.tar.gz`, which are accessible through the
    official website [SVHNSITE].

    .. [SVHNSITE] http://ufldl.stanford.edu/housenumbers/

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_1.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    try:
        output_path = os.path.join(output_directory, output_filename)
        h5file = h5py.File(output_path, mode='w')
        TMPDIR = tempfile.mkdtemp()

        # Every image has three channels (RGB) and variable height and width.
        # It features a variable number of bounding boxes that identify the
        # location and label of digits. The bounding box location is specified
        # using the x and y coordinates of its top left corner along with its
        # width and height.
        BoundingBoxes = namedtuple(
            'BoundingBoxes', ['labels', 'heights', 'widths', 'lefts', 'tops'])
        sources = ('features',) + tuple('bbox_{}'.format(field)
                                        for field in BoundingBoxes._fields)
        source_dtypes = dict([(source, 'uint8') for source in sources[:2]] +
                             [(source, 'uint16') for source in sources[2:]])
        source_axis_labels = {
            'features': ('channel', 'height', 'width'),
            'bbox_labels': ('bounding_box', 'index'),
            'bbox_heights': ('bounding_box', 'height'),
            'bbox_widths': ('bounding_box', 'width'),
            'bbox_lefts': ('bounding_box', 'x'),
            'bbox_tops': ('bounding_box', 'y')}

        # The dataset is split into three sets: the training set, the test set
        # and an extra set of examples that are somewhat less difficult but
        # can be used as extra training data. These sets are stored separately
        # as 'train.tar.gz', 'test.tar.gz' and 'extra.tar.gz'. Each file
        # contains a directory named after the split it stores. The examples
        # are stored in that directory as PNG images. The directory also
        # contains a 'digitStruct.mat' file with all the bounding box and
        # label information.
        splits = ('train', 'test', 'extra')
        file_paths = dict(zip(splits, FORMAT_1_FILES))
        for split, path in file_paths.items():
            file_paths[split] = os.path.join(directory, path)
        digit_struct_paths = dict(
            [(split, os.path.join(TMPDIR, split, 'digitStruct.mat'))
             for split in splits])

        # We first extract the data files in a temporary directory. While doing
        # that, we also count the number of examples for each split. Files are
        # extracted individually, which allows to display a progress bar. Since
        # the splits will be concatenated in the HDF5 file, we also compute the
        # start and stop intervals of each split within the concatenated array.
        def extract_tar(split):
            with tarfile.open(file_paths[split], 'r:gz') as f:
                members = f.getmembers()
                num_examples = sum(1 for m in members if '.png' in m.name)
                progress_bar_context = progress_bar(
                    name='{} file'.format(split), maxval=len(members),
                    prefix='Extracting')
                with progress_bar_context as bar:
                    for i, member in enumerate(members):
                        f.extract(member, path=TMPDIR)
                        bar.update(i)
            return num_examples

        examples_per_split = OrderedDict(
            [(split, extract_tar(split)) for split in splits])
        cumulative_num_examples = numpy.cumsum(
            [0] + list(examples_per_split.values()))
        num_examples = cumulative_num_examples[-1]
        intervals = zip(cumulative_num_examples[:-1],
                        cumulative_num_examples[1:])
        split_intervals = dict(zip(splits, intervals))

        # The start and stop indices are used to create a split dict that will
        # be parsed into the split array required by the H5PYDataset interface.
        # The split dict is organized as follows:
        #
        #     dict(split -> dict(source -> (start, stop)))
        #
        split_dict = OrderedDict([
            (split, OrderedDict([(s, split_intervals[split])
                                 for s in sources]))
            for split in splits])
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        # We then prepare the HDF5 dataset. This involves creating datasets to
        # store data sources and datasets to store auxiliary information
        # (namely the shapes for variable-length axes, and labels to indicate
        # what these variable-length axes represent).
        def make_vlen_dataset(source):
            # Create a variable-length 1D dataset
            dtype = h5py.special_dtype(vlen=numpy.dtype(source_dtypes[source]))
            dataset = h5file.create_dataset(
                source, (num_examples,), dtype=dtype)
            # Create a dataset to store variable-length shapes.
            axis_labels = source_axis_labels[source]
            dataset_shapes = h5file.create_dataset(
                '{}_shapes'.format(source), (num_examples, len(axis_labels)),
                dtype='uint16')
            # Create a dataset to store labels for variable-length axes.
            dataset_vlen_axis_labels = h5file.create_dataset(
                '{}_vlen_axis_labels'.format(source), (len(axis_labels),),
                dtype='S{}'.format(
                    numpy.max([len(label) for label in axis_labels])))
            # Fill variable-length axis labels
            dataset_vlen_axis_labels[...] = [
                label.encode('utf8') for label in axis_labels]
            # Attach auxiliary datasets as dimension scales of the
            # variable-length 1D dataset. This is in accordance with the
            # H5PYDataset interface.
            dataset.dims.create_scale(dataset_shapes, 'shapes')
            dataset.dims[0].attach_scale(dataset_shapes)
            dataset.dims.create_scale(dataset_vlen_axis_labels, 'shape_labels')
            dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
            # Tag fixed-length axis with its label
            dataset.dims[0].label = 'batch'

        for source in sources:
            make_vlen_dataset(source)

        # The "fun" part begins: we extract the bounding box and label
        # information contained in 'digitStruct.mat'. This is a version 7.3
        # Matlab file, which uses HDF5 under the hood, albeit with a very
        # convoluted layout.
        def get_boxes(split):
            boxes = []
            with h5py.File(digit_struct_paths[split], 'r') as f:
                bar_name = '{} digitStruct'.format(split)
                bar_maxval = examples_per_split[split]
                with progress_bar(bar_name, bar_maxval) as bar:
                    for image_number in range(examples_per_split[split]):
                        # The 'digitStruct' group is the main group of the HDF5
                        # file. It contains two datasets: 'bbox' and 'name'.
                        # The 'name' dataset isn't of interest to us, as it
                        # stores file names and there's already a one-to-one
                        # mapping between row numbers and image names (e.g.
                        # row 0 corresponds to '1.png', row 1 corresponds to
                        # '2.png', and so on).
                        main_group = f['digitStruct']
                        # The 'bbox' dataset contains the bounding box and
                        # label information we're after. It has as many rows
                        # as there are images, and one column. Elements of the
                        # 'bbox' dataset are object references that point to
                        # (yet another) group that contains the information
                        # for the corresponding image.
                        image_reference = main_group['bbox'][image_number, 0]

                        # There are five datasets contained in that group:
                        # 'label', 'height', 'width', 'left' and 'top'. Each of
                        # those datasets has as many rows as there are bounding
                        # boxes in the corresponding image, and one column.
                        def get_dataset(name):
                            return main_group[image_reference][name][:, 0]
                        names = ('label', 'height', 'width', 'left', 'top')
                        datasets = dict(
                            [(name, get_dataset(name)) for name in names])

                        # If there is only one bounding box, the information is
                        # stored directly in the datasets. If there are
                        # multiple bounding boxes, elements of those datasets
                        # are object references pointing to 1x1 datasets that
                        # store the information (fortunately, it's the last
                        # hop we need to make).
                        def get_elements(dataset):
                            if len(dataset) > 1:
                                return [int(main_group[reference][0, 0])
                                        for reference in dataset]
                            else:
                                return [int(dataset[0])]
                        # Names are pluralized in the BoundingBox named tuple.
                        kwargs = dict(
                            [(name + 's', get_elements(dataset))
                             for name, dataset in iteritems(datasets)])
                        boxes.append(BoundingBoxes(**kwargs))
                        if bar:
                            bar.update(image_number)
            return boxes

        split_boxes = dict([(split, get_boxes(split)) for split in splits])

        # The final step is to fill the HDF5 file.
        def fill_split(split, bar=None):
            for image_number in range(examples_per_split[split]):
                image_path = os.path.join(
                    TMPDIR, split, '{}.png'.format(image_number + 1))
                image = numpy.asarray(
                    Image.open(image_path)).transpose(2, 0, 1)
                bounding_boxes = split_boxes[split][image_number]
                num_boxes = len(bounding_boxes.labels)
                index = image_number + split_intervals[split][0]

                h5file['features'][index] = image.flatten()
                h5file['features'].dims[0]['shapes'][index] = image.shape
                for field in BoundingBoxes._fields:
                    name = 'bbox_{}'.format(field)
                    h5file[name][index] = getattr(bounding_boxes, field)
                    h5file[name].dims[0]['shapes'][index] = [num_boxes, 1]

                # Replace label '10' with '0'.
                labels = h5file['bbox_labels'][index]
                labels[labels == 10] = 0
                h5file['bbox_labels'][index] = labels

                if image_number % 1000 == 0:
                    h5file.flush()
                if bar:
                    bar.update(index)

        with progress_bar('SVHN format 1', num_examples) as bar:
            for split in splits:
                fill_split(split, bar=bar)
    finally:
        if os.path.isdir(TMPDIR):
            shutil.rmtree(TMPDIR)
        h5file.flush()
        h5file.close()

    return (output_path,)
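
Because the images are stored flattened with their shapes in the auxiliary '*_shapes' datasets, reading one example back with plain h5py looks roughly like the sketch below (the function name and returned fields are illustrative).

import h5py

def read_svhn_example(path, index):
    # Recover one variable-length image and its per-box labels from the
    # layout written above: flat pixel data plus a '<source>_shapes' dataset.
    with h5py.File(path, 'r') as f:
        image = f['features'][index].reshape(f['features_shapes'][index])
        labels = f['bbox_labels'][index]
    return image, labels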
Example #48
def write_h5py_dataset(
    nested_dataset_dict,
    sources_dim_labels,
    file_path,
    dtype=config.floatX
):
    """ Creates a h5py file based dataset (writes this to disk).

    Parameters
    ----------
    nested_dataset_dict : {
        <subset>: {
            <source>: numpy.ndarray()
        }
    }, where
        <subset> : str
        <source> : str
    sources_dim_labels : {
        <source> : [str]  # list length equals the number of dimensions of the source
    }
    file_path : str
    dtype : str, optional
        Defaults to config.floatX.

    Returns
    -------
    None
        The dataset is written to disk at `file_path`.
    """
    previous_end_idx = 0
    source_to_list_of_subsets = {}
    split_dict = {}
    for subset_name in nested_dataset_dict.keys():
        if subset_name not in split_dict:
            split_dict[subset_name] = {}
        n_samples = nested_dataset_dict[subset_name].values()[0].shape[0]
        for source_name in nested_dataset_dict[subset_name].keys():
            assert nested_dataset_dict[
                subset_name
            ][source_name].shape[0] == n_samples
            split_dict[subset_name][source_name] = (
                previous_end_idx,
                previous_end_idx + n_samples
            )
            if source_name not in source_to_list_of_subsets:
                source_to_list_of_subsets[source_name] = [
                    nested_dataset_dict[subset_name][source_name]
                ]
            else:
                source_to_list_of_subsets[source_name].append(
                    nested_dataset_dict[subset_name][source_name]
                )
        previous_end_idx += n_samples

    concatenated_subsets = {}
    for source_name in source_to_list_of_subsets.keys():
        concatenated_subsets[source_name] = np.concatenate(
            source_to_list_of_subsets[source_name],
            axis=0
        )

    def write_one_source(
        source_name,
        source_data,
        source_dim_labels,
        h5py_file
    ):
        """ Writes the content for one source to the passed H5PY File.

        Parameters
        ----------
        source_name : str
        source_data : ndarray(shape=S)
        source_dim_labels : [str]
            len(source_dim_labels) = len(S)
        h5py_file : h5py.File
        """
        source_handle = h5py_file.create_dataset(
            source_name,
            source_data.shape,
            dtype=dtype
        )
        source_handle[...] = source_data
        for dim, label in zip(source_handle.dims, source_dim_labels):
            dim.label = label

    with h5py.File(file_path, mode='w') as f:
        for source_name in concatenated_subsets.keys():
            write_one_source(
                source_name=source_name,
                source_data=concatenated_subsets[source_name],
                source_dim_labels=sources_dim_labels[source_name],
                h5py_file=f
            )
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()  # the with statement closes the file on exit
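
A hedged usage example; the subset and source names are illustrative, and with plain dicts the subset iteration order determines the row intervals recorded in the split.

import numpy as np

nested = {
    'train': {'features': np.random.rand(8, 4), 'targets': np.random.rand(8, 1)},
    'test': {'features': np.random.rand(2, 4), 'targets': np.random.rand(2, 1)},
}
dim_labels = {'features': ['batch', 'feature'], 'targets': ['batch', 'index']}
write_h5py_dataset(nested, dim_labels, 'toy_dataset.hdf5')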
Example #49
f = h5py.File('../../data/dataset.hdf5', mode='w')
images = f.create_dataset('images', train_feature.shape, dtype='float32')
targets = f.create_dataset('targets', train_target.shape, dtype='float32')

images[...] = train_feature
targets[...] = train_target

split_dict = {
    'train': {
        'images': (0, train_feature.shape[0]),
        'targets': (0, train_target.shape[0])
    }
}

f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

f.flush()
f.close()

train_set = H5PYDataset('../../data/dataset.hdf5', which_sets=('train',))
#data_stream = DataStream(dataset=train_set, iteration_scheme=scheme)

#state = train_set.open()
scheme = ShuffledScheme(examples=train_set.num_examples, batch_size=4)

data_stream = DataStream(dataset=train_set, iteration_scheme=scheme)
for data in data_stream.get_epoch_iterator():
    print(data[0], data[1])
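
The scale and shift constants used in the ScaleAndShift call below are assumptions; one plausible choice, standardising with statistics from the in-memory training features, is sketched here (ScaleAndShift applies data * scale + shift).

# Assumed standardisation constants for the call below.
mean = train_feature.mean()
std = train_feature.std()
scale = 1.0 / std       # divide by the training standard deviation
shift = -mean / std     # then subtract the scaled training mean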

standardized_stream = ScaleAndShift(data_stream=data_stream,
                                    scale=scale, shift=shift)
Example #50
data_to_use = numpy.asarray(data_to_use)
input_array = data_to_use[:-frame_length].reshape(num_examples, seq_length, frame_length)
target_array = data_to_use[frame_length:].reshape(num_examples, seq_length, frame_length)
print input_array.shape
print target_array.shape

# Make H5PY file
print "\nMaking Fuel-formatted HDF5 file..."
f = h5py.File(hdf5_file, mode="w")
inputs = f.create_dataset("inputs", input_array.shape, dtype="float64")
targets = f.create_dataset("targets", target_array.shape, dtype="float64")
inputs[...] = input_array
targets[...] = target_array
inputs.dims[0].label = "batch"
inputs.dims[1].label = "sequence"
targets.dims[0].label = "batch"
targets.dims[1].label = "sequence"

# Split into train:dev
print "doing train:dev split (at " + str(train_samples) + ")"
num_train_examples = train_samples // example_length
split_dict = {
    "train": {"inputs": (0, num_train_examples), "targets": (0, num_train_examples)},
    "dev": {"inputs": (num_train_examples, num_examples), "targets": (num_train_examples, num_examples)},
}

f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
f.flush()
f.close()
print "Fuel-formatted HDF5 file written: " + hdf5_file
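
A hedged read-back sketch for the file just written; hdf5_file is the path used above, SequentialScheme and the batch size are illustrative, and the batch order follows the sources passed to the dataset.

from fuel.datasets.hdf5 import H5PYDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

train_set = H5PYDataset(hdf5_file, which_sets=('train',),
                        sources=('inputs', 'targets'))
scheme = SequentialScheme(examples=train_set.num_examples, batch_size=16)
stream = DataStream(dataset=train_set, iteration_scheme=scheme)
inputs_batch, targets_batch = next(stream.get_epoch_iterator())
print(inputs_batch.shape)  # (batch, sequence, frame_length)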