Example #1
def h5_cmpd():
    h = h5py.File('Brain_ONED_'+str(seed_no)+'.h5', 'a')
    maping_ex = {}
    maping_in = {}
    for ii in range(len(Me.spiketimes)):
        if len(Me[ii]) != 0:
            dset = h.create_dataset('/data/events/excitatory/spikes/' + str(ii), data=Me[ii])
            dset.attrs.create('source', data='nrn_'+str(ii))
            maping_ex['nrn_'+str(ii)] = '/data/events/excitatory/spikes/' + str(ii)
    for ii in range(len(Mi.spiketimes)):
        if len(Mi[ii]) != 0:
            dset = h.create_dataset('/data/events/inhibitory/spikes/' + str(ii), data=Mi[ii])
            dset.attrs.create('source', data='nrn_'+str(ii))
            maping_in['nrn_'+str(ii)] = '/data/events/inhibitory/spikes/' + str(ii)
    sp_type = np.dtype([('name', h5py.special_dtype(vlen=str)),('reference', h5py.special_dtype(vlen=str))])
    m_ex = h.create_dataset('/map/events/excitatory/spikes', dtype=sp_type, shape=(len(maping_ex),))
    m_in = h.create_dataset('/map/events/inhibitory/spikes', dtype=sp_type, shape=(len(maping_in),))
    doh_ = 0
    for ii,jj in maping_ex.items():
        m_ex[doh_] = (ii, jj)
        doh_ += 1
    doh_ = 0
    for ii,jj in maping_in.items():
        m_in[doh_] = (ii, jj)
        doh_ += 1
    h.close()
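The pattern above pairs per-neuron spike datasets with a compound "map" dataset of (name, path) rows. Below is a minimal, self-contained sketch of the same idea, assuming synthetic spike times in place of the Brian `Me`/`Mi` monitors and a made-up file name:

import numpy as np
import h5py

# Synthetic spike times standing in for the Me/Mi spike monitors.
spikes = {0: np.array([1.0, 2.5, 7.25]), 3: np.array([0.5])}

with h5py.File('demo_map.h5', 'w') as h:
    mapping = {}
    for idx, times in spikes.items():
        path = '/data/events/excitatory/spikes/' + str(idx)
        dset = h.create_dataset(path, data=times)
        dset.attrs.create('source', data='nrn_' + str(idx))
        mapping['nrn_' + str(idx)] = path

    # Compound dtype with two variable-length string fields.
    sp_type = np.dtype([('name', h5py.special_dtype(vlen=str)),
                        ('reference', h5py.special_dtype(vlen=str))])
    m = h.create_dataset('/map/events/excitatory/spikes',
                         dtype=sp_type, shape=(len(mapping),))
    for row, (name, ref) in enumerate(mapping.items()):
        m[row] = (name, ref)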
Example #2
    def _check_data(self, data):
        """Check that the data provided by the dataset is valid.

        It is valid when it can be stored in an HDF5 file using h5py.

        :param numpy.ndarray data: Data associated to the dataset
        :raises TypeError: In the case the data is not valid.
        """
        if isinstance(data, (six.text_type, six.binary_type)):
            return

        chartype = data.dtype.char
        if chartype == "U":
            pass
        elif chartype == "O":
            d = h5py.special_dtype(vlen=data.dtype)
            if d is not None:
                return
            d = h5py.special_dtype(ref=data.dtype)
            if d is not None:
                return
        else:
            return

        msg = "Type of the dataset '%s' is not supported. Found '%s'."
        raise TypeError(msg % (self.name, data.dtype))
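`_check_data` probes a dtype by constructing `special_dtype` objects; the complementary helper for inspecting an existing dtype is `h5py.check_dtype`, which returns the vlen base type, the reference class, or the enum mapping, or `None` when the dtype carries no such metadata. A small illustrative sketch:

import numpy as np
import h5py

vlen_dt = h5py.special_dtype(vlen=str)
ref_dt = h5py.special_dtype(ref=h5py.Reference)
enum_dt = h5py.special_dtype(enum=('i', {'false': 0, 'true': 1}))

print(h5py.check_dtype(vlen=vlen_dt))         # base type of the vlen data (str)
print(h5py.check_dtype(ref=ref_dt))           # h5py.Reference
print(h5py.check_dtype(enum=enum_dt))         # {'false': 0, 'true': 1}
print(h5py.check_dtype(vlen=np.dtype('f8')))  # None: plain dtype, no vlen info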
Example #3
    def WriteSimpleTypes(self):
        if len(self.simpleTypes) == 0:
            return

        maxLenTypeName = self._getMaxLength([x.name for x in self.simpleTypes])
        maxLenQuantity = self._getMaxLength([x.quantity for x in self.simpleTypes])
        # maxLenUnit = self._getMaxLength([x.unit for x in self.simpleTypes])
        numpyDataType = numpy.dtype(
            {'names': ['name', 'dataType', 'quantity', 'relativeQuantity',
                       'description', 'unitOrEnumerationRow'],
             'formats': ['S' + str(max(maxLenTypeName, 1)),
                         h5py.special_dtype(enum=(numpy.uint8, DataType)),  # 'uint8'
                         'S' + str(max(maxLenQuantity, 1)),
                         h5py.special_dtype(enum=(numpy.uint8, {'false': 0, 'true': 1})),  # 'uint8'
                         'S1',
                         'int32']})
        dataset = self.description.create_dataset('SimpleTypes', (len(self.simpleTypes), 1), dtype=numpyDataType, maxshape=(len(self.simpleTypes), 1), compression='gzip')
        allData = []
        for simpleType in self.simpleTypes:
            allData.append((simpleType.name, simpleType.dataType,
                            simpleType.quantity,
                            simpleType.relativeQuantity,
                            '',
                            simpleType.unitOrEnumerationRow))
        dataset[:, 0] = allData
Example #4
def read_digital_compound(filename, fieldname, tmax, all_pops, h):
    cell_range = [0,1000,1050,1140,1230,1320,1560,2360,2560,3060,3160,3260,3360,3460,3560]
    pop_names = ['pyrRS23','pyrFRB23','bask23','axax23','LTS23', 
                 'spinste14', 'tuftIB5', 'tuftRS5', 'nontuftRS6', 
                 'bask56', 'axax56', 'LTS56', 'TCR', 'nRT']
    arr = pd.read_csv(filename, sep='\t', names=['times','cells'])
    u_cells = arr.cells.unique()  #get names of cells that fired
    pop_cell_dict = {} #dict of dict
    for cell_name in u_cells:
        pop_idx = bisect.bisect(cell_range, cell_name) - 1
        if pop_idx in all_pops: #only those populations of interest
            pop_name = pop_names[pop_idx]
            try:
                pop_cell_dict[pop_name][cell_name] = arr[(arr.cells == cell_name) & (arr.times <= tmax)].times.values
            except KeyError:
                pop_cell_dict[pop_name] = {cell_name:arr[(arr.cells == cell_name) & (arr.times <= tmax)].times.values}
    sp_type = np.dtype([('unique_id', h5py.special_dtype(vlen=str)),('path', h5py.special_dtype(vlen=str))])
    for pop_name,cell_dicts in pop_cell_dict.iteritems(): #flush them into hdf5
        ii = '/data/events/'+pop_name+'/spike_'+fieldname+'/'
        for cell_name, cell_value in cell_dicts.iteritems():
            e_dset = h.create_dataset(ii+str(cell_name), dtype=np.float32, data=cell_value)
            e_dset.attrs.create('SOURCE', data=str(cell_name))
        e_mset = h.create_dataset('/map/events/'+pop_name+'/spike_'+fieldname, dtype=sp_type, shape=(len(cell_dicts),))
        for idx,cell_name in enumerate(cell_dicts.iterkeys()):
            e_mset[idx] = (str(cell_name), ii+str(cell_name))  # path must match the dataset created above
        #attach_to_all_under(h, 'events/'+pop_name, e_mset)
            
    print 'Done', filename
    return h
Example #5
    def _set_values_to_brick(self, brick_guid, brick_slice, values, value_slice=None):
        brick_file_path = os.path.join(self.brick_path, '{0}.hdf5'.format(brick_guid))
        log.trace('Brick slice to fill: %s', brick_slice)
        log.trace('Value slice to extract: %s', value_slice)

        # Create the HDF5 dataset that represents one brick
        bD = tuple(self.brick_domains[1])
        cD = self.brick_domains[2]
        if value_slice is not None:
            vals = values[value_slice]
        else:
            vals = values

        if values.ndim == 0 and len(values.shape) == 0 and np.iterable(vals): # Prevent single value strings from being iterated
            vals = [vals]

        # Check for object type
        data_type = self.dtype
        fv = self.fill_value

        # Check for object type
        if data_type == '|O8':
            if np.iterable(vals):
                vals = [pack(x) for x in vals]
            else:
                vals = pack(vals)

        if self.inline_data_writes:
            if data_type == '|O8':
                data_type = h5py.special_dtype(vlen=str)
            if 0 in cD or 1 in cD:
                cD = True
            with HDFLockingFile(brick_file_path, 'a') as f:
                # TODO: Due to usage concerns, currently locking chunking to "auto"
                f.require_dataset(brick_guid, shape=bD, dtype=data_type, chunks=None, fillvalue=fv)
                f[brick_guid][brick_slice] = vals
        else:
            work_key = brick_guid
            work = (brick_slice, vals)
            work_metrics = (brick_file_path, bD, cD, data_type, fv)
            log.trace('Work key: %s', work_key)
            log.trace('Work metrics: %s', work_metrics)
            log.trace('Work[0]: %s', work[0])

            # If the brick file doesn't exist, 'touch' it to make sure it's immediately available
            if not os.path.exists(brick_file_path):
                if data_type == '|O8':
                    data_type = h5py.special_dtype(vlen=str)
                if 0 in cD or 1 in cD:
                    cD = True
                with HDFLockingFile(brick_file_path, 'a') as f:
                    # TODO: Due to usage concerns, currently locking chunking to "auto"
                    f.require_dataset(brick_guid, shape=bD, dtype=data_type, chunks=None, fillvalue=fv)

            if self.auto_flush:
                # Immediately submit work to the dispatcher
                self.brick_dispatcher.put_work(work_key, work_metrics, work)
            else:
                # Queue the work for later flushing
                self._queue_work(work_key, work_metrics, work)
Example #6
def PostHDF5 (p, post_data):
  """Post data using the hdf5 interface"""

  # Build the url and then create a hdf5 object
  url = 'http://{}/{}/{}/hdf5/{}/{},{}/{},{}/{},{}/'.format(SITE_HOST, p.token, ','.join(p.channels), p.resolution, *p.args)

  tmpfile = tempfile.NamedTemporaryFile ()
  fh5out = h5py.File ( tmpfile.name )
  for idx, channel_name in enumerate(p.channels):
    chan_grp = fh5out.create_group(channel_name)
    chan_grp.create_dataset("CUTOUT", tuple(post_data[idx,:].shape), post_data[idx,:].dtype, compression='gzip', data=post_data[idx,:])
    chan_grp.create_dataset("CHANNELTYPE", (1,), dtype=h5py.special_dtype(vlen=str), data=p.channel_type)
    chan_grp.create_dataset("DATATYPE", (1,), dtype=h5py.special_dtype(vlen=str), data=p.datatype)
  fh5out.close()
  tmpfile.seek(0)
  
  try:
    # Build a post request
    req = urllib2.Request(url, tmpfile.read())
    import time
    start = time.time()
    response = urllib2.urlopen(req)
    print time.time()-start
    tmpfile.close()
    return response
  except urllib2.HTTPError,e:
    return e
Example #7
 def __setitem__(self,key,value):
     if key in self:
         del self[key]
     if type(value) == tuple:
         maxshape = (None,) + value[1:]
         if type(self.__backing) == dict:
             self.__backing[key] = np.ndarray(
                 value,
                 dtype='<U15')
         elif type(self.__backing) == h5py.File:
             dtype = (h5py.special_dtype(vlen=unicode)
                  if key.lower() in ['id','name'] else 'float32' )
             self.__backing.create_dataset(
                 key,
                 shape=value,
                 maxshape=maxshape,
                 dtype=dtype)
     elif type(value) == np.ndarray:
         if type(self.__backing) == dict:
             self.__backing[key] = value
         elif type(self.__backing) == h5py.File:
             dtype = (h5py.special_dtype(vlen=unicode)
                  if key.lower() in ['id','name'] else 'float32' )
             maxshape = (None,) + value.shape[1:]
             self.__backing.create_dataset(
                 key,
                 shape=value.shape,
                 maxshape=maxshape,
                 dtype=dtype)
             self.__backing[key][:] = value
     else: raise TypeError
Example #8
def _main(args):
    voc_path = os.path.expanduser(args.path_to_voc)
    train_ids = get_ids(voc_path, train_set)
    val_ids = get_ids(voc_path, val_set)
    test_ids = get_ids(voc_path, test_set)
    train_ids_2007 = get_ids(voc_path, sets_from_2007)
    total_train_ids = len(train_ids) + len(train_ids_2007)

    # Create HDF5 dataset structure
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_07_12.hdf5')
    voc_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(
        vlen=np.dtype('uint8'))  # variable length uint8
    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))  # variable length default int
    train_group = voc_h5file.create_group('train')
    val_group = voc_h5file.create_group('val')
    test_group = voc_h5file.create_group('test')

    # store class list for reference class ids as csv fixed-length numpy string
    voc_h5file.attrs['classes'] = np.string_(str.join(',', classes))

    # store images as variable length uint8 arrays
    train_images = train_group.create_dataset(
        'images', shape=(total_train_ids, ), dtype=uint8_dt)
    val_images = val_group.create_dataset(
        'images', shape=(len(val_ids), ), dtype=uint8_dt)
    test_images = test_group.create_dataset(
        'images', shape=(len(test_ids), ), dtype=uint8_dt)

    # store boxes as class_id, xmin, ymin, xmax, ymax
    train_boxes = train_group.create_dataset(
        'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt)
    val_boxes = val_group.create_dataset(
        'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt)
    test_boxes = test_group.create_dataset(
        'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt)

    # process all ids and add to datasets
    print('Processing Pascal VOC 2007 datasets for training set.')
    last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images,
                               train_boxes)
    print('Processing Pascal VOC 2012 training set.')
    add_to_dataset(
        voc_path,
        '2012',
        train_ids,
        train_images,
        train_boxes,
        start=last_2007 + 1)
    print('Processing Pascal VOC 2012 val set.')
    add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes)
    print('Processing Pascal VOC 2007 test set.')
    add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)

    print('Closing HDF5 file.')
    voc_h5file.close()
    print('Done.')
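A stripped-down sketch of the variable-length storage used above, with synthetic byte strings standing in for encoded images and flat integer arrays for the box records (file name is illustrative):

import numpy as np
import h5py

uint8_dt = h5py.special_dtype(vlen=np.dtype('uint8'))  # variable-length uint8
vlen_int_dt = h5py.special_dtype(vlen=np.dtype(int))   # variable-length int

with h5py.File('vlen_demo.hdf5', 'w') as f:
    images = f.create_dataset('images', shape=(2,), dtype=uint8_dt)
    boxes = f.create_dataset('boxes', shape=(2,), dtype=vlen_int_dt)

    # Each element holds a whole 1-D array of its own length.
    images[0] = np.frombuffer(b'fake image bytes', dtype=np.uint8)
    images[1] = np.zeros(10, dtype=np.uint8)
    boxes[0] = np.array([3, 10, 20, 50, 60])             # one box record
    boxes[1] = np.array([1, 0, 0, 5, 5, 2, 7, 7, 9, 9])  # two box records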
Example #9
def h5ProjInfo ( proj, h5f ):
  """Populate the HDF5 file with project attributes"""
  
  projgrp = h5f.create_group ( 'PROJECT' )
  projgrp.create_dataset("NAME", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.project_name)
  projgrp.create_dataset("HOST", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.host)
  projgrp.create_dataset("ND_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.nd_version)
  projgrp.create_dataset("SCHEMA_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.schema_version)
Example #10
def h5ProjInfo ( proj, h5f ):
  """Populate the HDF5 file with project attributes"""
  
  projgrp = h5f.create_group ( 'PROJECT' )
  projgrp.create_dataset("NAME", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getProjectName())
  projgrp.create_dataset("HOST", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getDBHost())
  projgrp.create_dataset("OCP_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getOCPVersion())
  projgrp.create_dataset("SCHEMA_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getSchemaVersion())
Example #11
 def testVlenReferenceDataItem(self):
     ref_dt = special_dtype(ref=Reference)
     dt = special_dtype(vlen=ref_dt)
     typeItem = hdf5dtype.getTypeItem(dt)
     typeSize = hdf5dtype.getItemSize(typeItem)
     self.assertEqual(typeItem['class'], 'H5T_VLEN')
     self.assertEqual(typeItem['size'], 'H5T_VARIABLE')
     baseItem = typeItem['base']
     self.assertEqual(baseItem['base'], 'H5T_STD_REF_OBJ')
     self.assertEqual(typeSize, 'H5T_VARIABLE')
Example #12
 def input_data(self, value):
     if 'input_data' in self.h5group.keys():
         self.h5group['input_data'] = value
     else:
         if isinstance(value,h5py.h5r.Reference):
             self.h5group.create_dataset('input_data',  data=value, dtype=h5py.special_dtype(ref=h5py.Reference))
         else:
             print value
             dset=self.h5group.create_dataset('input_data', (len(value),), dtype=h5py.special_dtype(ref=h5py.Reference))
             for i,v in enumerate(value):
                 dset[i]=v
Example #13
def h5ProjInfo ( proj, h5f ):
  """Populate the HDF5 file with project attributes"""
  projgrp = h5f.create_group ( 'PROJECT' )
  projgrp.create_dataset ( "NAME", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dbname )
  projgrp.create_dataset ( "HOST", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dbhost )
  projgrp.create_dataset ( "TYPE", (1,), dtype=np.uint32, data=proj._dbtype )
  projgrp.create_dataset ( "DATASET", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dataset )
  projgrp.create_dataset ( "DATAURL", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dataurl )
  projgrp.create_dataset ( "READONLY", (1,), dtype=bool, data=(False if proj._readonly==0 else True))
  projgrp.create_dataset ( "EXCEPTIONS", (1,), dtype=bool, data=(False if proj._exceptions==0 else True))
  projgrp.create_dataset ( "RESOLUTION", (1,), dtype=np.uint8, data=proj._resolution)
Example #14
        def walk(dd, df):
            for key, value in dd.iteritems():
                if isinstance(value, dict):
                    try:
                        dset = df[key]
                    except:
                        dset = df.require_group(key)
                    walk(value, dset)
                else:
                    if (type(value) is np.float) or \
                       (type(value) is np.int):
                        try:
                            dset = df[key]
                        except KeyError:
                            dset = df.require_dataset(
                                key,
                                (0, 1),
                                type(value),
                                maxshape=(None, 1),
                                compression='lzf')
                        dset.resize(dset.shape[0]+1, axis=0)
                        dset[-1, 0] = value

                    if (type(value) is np.str):
                        try:
                            dset = df[key]
                        except KeyError:
                            dt = h5py.special_dtype(vlen=unicode)
                            dset = df.require_dataset(
                                key,
                                (0, 1),
                                dt,
                                maxshape=(None, 1),
                                compression='lzf')
                        dset.resize(dset.shape[0]+1, axis=0)
                        dset[-1, 0] = value

                    if type(value) is np.ndarray:
                        if type(value[0]) is np.string_:
                            dt = h5py.special_dtype(vlen=unicode)
                        else:
                            dt = np.float
                        try:
                            dset = df[key]
                        except KeyError:
                            dset = df.require_dataset(
                                key,
                                (0,)+value.shape,
                                dt,
                                maxshape=(None,)+value.shape,
                                compression='lzf')
                        dset.resize(dset.shape[0]+1, axis=0)
                        dset[-1, ...] = value
Example #15
def copyAttributes(inDs, outDs):
    for k in inDs.attrs.keys():
        logging.debug("copying attribute: %s" % k)
        elt = inDs.attrs[k]
        if isinstance(elt, basestring):
            # h5py wants to simplify things down, so I think that this
            # is a possibility.
            newDtype = H5.special_dtype(vlen = str)
        elif elt.dtype == 'object':
            # this has to do with a numpy problem.
            newDtype = H5.special_dtype(vlen = str)
        else:
            newDtype = elt.dtype
            
        outDs.attrs.create(k, inDs.attrs[k], dtype = newDtype)
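`copyAttributes` is Python 2 code (`basestring`, with `H5` presumably an alias for the h5py module). A hedged Python 3 sketch of the same idea:

import logging
import numpy as np
import h5py

def copy_attributes_py3(in_ds, out_ds):
    """Copy attributes, forcing strings and object arrays to a vlen-str dtype."""
    for k, elt in in_ds.attrs.items():
        logging.debug("copying attribute: %s", k)
        if isinstance(elt, (str, bytes)):
            new_dtype = h5py.special_dtype(vlen=str)
        elif getattr(elt, 'dtype', None) == np.dtype(object):
            new_dtype = h5py.special_dtype(vlen=str)
        else:
            new_dtype = elt.dtype
        out_ds.attrs.create(k, elt, dtype=new_dtype)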
Example #16
 def WriteEnumerations(self):
     if len(self.enumerations) == 0:
         return
     
     numpyDataType = numpy.dtype(
         {'names': ['name', 'value', 'description', 'firstEntry'],
          'formats': [h5py.special_dtype(vlen=unicode),  # 'S' + str(max(maxLenName, 1))
                      'int32',
                      h5py.special_dtype(vlen=unicode),  # 'S' + str(max(maxLenDescription, 1))
                      h5py.special_dtype(enum=(numpy.uint8, {'false': 0, 'true': 1}))]})  # 'uint8'
     dataset = self.description.create_dataset('Enumerations', (len(self.enumerations), 1), dtype=numpyDataType, maxshape=(len(self.enumerations), 1), compression='gzip')
     allData = []
     for enum in self.enumerations:
         allData.append((enum.name, enum.value, enum.description, enum.firstEntry))
     dataset[:, 0] = allData
Example #17
 def parse_structure(key, group, value, _type, **kwds):
     try:
         # Here we check if there are any signals in the container, as
         # casting a long list of signals to a numpy array takes a very long
         # time. So we check if there are any, and save numpy the trouble
         if np.any([isinstance(t, BaseSignal) for t in value]):
             tmp = np.array([[0]])
         else:
             tmp = np.array(value)
     except ValueError:
         tmp = np.array([[0]])
     if tmp.dtype is np.dtype('O') or tmp.ndim is not 1:
         dict2hdfgroup(dict(zip(
             [str(i) for i in range(len(value))], value)),
             group.create_group(_type + str(len(value)) + '_' + key),
             **kwds)
     elif tmp.dtype.type is np.unicode_:
         group.create_dataset(_type + key,
                              tmp.shape,
                              dtype=h5py.special_dtype(vlen=str),
                              **kwds)
         group[_type + key][:] = tmp[:]
     else:
         group.create_dataset(
             _type + key,
             data=tmp,
             **kwds)
Example #18
    def _save_hdf5_v2(self, filename, group = "Twiss"):
        # data type
        dt = np.dtype( [ 
            ('element', h5py.special_dtype(vlen=bytes)),
            ('s',      np.float64),
            ('alphax', np.float64),
            ('alphay', np.float64),
            ('betax',  np.float64),
            ('betay',  np.float64),
            ('etax',   np.float64),
            ('etaxp',  np.float64),
            ('etay',   np.float64),
            ('etayp',  np.float64),
            ('phix',   np.float64),
            ('phiy',   np.float64),
            ] )

        data = np.ndarray((len(self.element),), dtype=dt)
        data['element'] = self.element
        for i,k in enumerate(self._cols):
            data[k] = [v[i] for v in self._twtable]

        f = h5py.File(filename)
        grp = f.create_group(group)
        grp['twtable'] = data
        grp['tune'] = np.array(self.tune)
        grp['chrom'] = np.array(self.chrom)
        grp['alphac'] = self.alphac
        f.close()
Example #19
 def test_vlen_bytes(self):
     """ Vlen bytes dataset maps to vlen ascii in the file """
     dt = h5py.special_dtype(vlen=bytes)
     ds = self.f.create_dataset('x', (100,), dtype=dt)
     tid = ds.id.get_type()
     self.assertEqual(type(tid), h5py.h5t.TypeStringID)
     self.assertEqual(tid.get_cset(), h5py.h5t.CSET_ASCII)
Example #20
 def test_create(self):
     """ Enum datasets can be created and type correctly round-trips """
     dt = h5py.special_dtype(enum=('i', self.EDICT))
     ds = self.f.create_dataset('x', (100,100), dtype=dt)
     dt2 = ds.dtype
     dict2 = h5py.check_dtype(enum=dt2)
     self.assertEqual(dict2,self.EDICT)
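The test above relies on the enum mapping being recoverable from the stored dtype. A standalone sketch of that round trip (file name is illustrative):

import h5py

EDICT = {'RED': 0, 'GREEN': 1, 'BLUE': 2}
dt = h5py.special_dtype(enum=('i', EDICT))

with h5py.File('enum_demo.h5', 'w') as f:
    ds = f.create_dataset('x', (4,), dtype=dt)
    ds[...] = [EDICT['RED'], EDICT['BLUE'], EDICT['GREEN'], EDICT['RED']]
    # check_dtype recovers the name -> value mapping from the dataset's dtype.
    print(h5py.check_dtype(enum=ds.dtype))  # {'RED': 0, 'GREEN': 1, 'BLUE': 2}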
Example #21
 def test_vlen_unicode(self):
     """ Vlen unicode dataset maps to vlen utf-8 in the file """
     dt = h5py.special_dtype(vlen=unicode)
     ds = self.f.create_dataset('x', (100,), dtype=dt)
     tid = ds.id.get_type()
     self.assertEqual(type(tid), h5py.h5t.TypeStringID)
     self.assertEqual(tid.get_cset(), h5py.h5t.CSET_UTF8)
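Together, this test and Example #19 show that `vlen=bytes` maps to an ASCII character set while `vlen=str` (or `unicode` on Python 2) maps to UTF-8. With h5py 2.10+ the same two dtypes can be spelled via `string_dtype`; a hedged sketch:

import h5py

ascii_dt = h5py.string_dtype(encoding='ascii')  # ~ special_dtype(vlen=bytes)
utf8_dt = h5py.string_dtype(encoding='utf-8')   # ~ special_dtype(vlen=str)

with h5py.File('strings_demo.h5', 'w') as f:
    a = f.create_dataset('ascii_strings', (2,), dtype=ascii_dt)
    u = f.create_dataset('utf8_strings', (2,), dtype=utf8_dt)
    a[0] = b'plain bytes'
    u[0] = 'UTF-8 text: héllo'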
Example #22
    def __init__(self, output_name, output_dir, num_files, patches, feature_type,
                 patch_dim=128, patch_type='uint8', pos_type='uint16'):
        self.log = get_logger()

        output_subdir = output_dir
        try:
            makedirs(output_subdir)
        except:
            pass

        output_filename = join(output_subdir, basename(output_name))
        self.log.debug('Saving extracted descriptors to %s', output_filename)

        self.mode = 'creating'
        dt = special_dtype(vlen=bytes)
        patches += 10 #for safety
        self.hfile = HDF5File(output_filename, 'w', compression='gzip', fillvalue=0.0)
        self.patches = self.hfile.create_dataset('patches', (num_files * patches, patch_dim), dtype=patch_type, chunks=True)
        self.positions = self.hfile.create_dataset('positions', (num_files * patches, 2), dtype=pos_type, chunks=True)
        self.image_index = self.hfile.create_dataset('image_index', (num_files, 2), dtype='uint64') # Start, End positions of an image
        self.keys = self.hfile.create_dataset('keys', (num_files, ), dtype=dt)
        self.key_set = set()
        self.patches.attrs['cursor'] = 0
        self.patches.attrs['feature_type'] = feature_type

        self.output_filename = output_filename
Example #23
 def make_vlen_dataset(source):
     # Create a variable-length 1D dataset
     dtype = h5py.special_dtype(vlen=numpy.dtype(source_dtypes[source]))
     dataset = h5file.create_dataset(
         source, (num_examples,), dtype=dtype)
     # Create a dataset to store variable-length shapes.
     axis_labels = source_axis_labels[source]
     dataset_shapes = h5file.create_dataset(
         '{}_shapes'.format(source), (num_examples, len(axis_labels)),
         dtype='uint16')
     # Create a dataset to store labels for variable-length axes.
     dataset_vlen_axis_labels = h5file.create_dataset(
         '{}_vlen_axis_labels'.format(source), (len(axis_labels),),
         dtype='S{}'.format(
             numpy.max([len(label) for label in axis_labels])))
     # Fill variable-length axis labels
     dataset_vlen_axis_labels[...] = [
         label.encode('utf8') for label in axis_labels]
     # Attach auxiliary datasets as dimension scales of the
     # variable-length 1D dataset. This is in accordance with the
     # H5PYDataset interface.
     dataset.dims.create_scale(dataset_shapes, 'shapes')
     dataset.dims[0].attach_scale(dataset_shapes)
     dataset.dims.create_scale(dataset_vlen_axis_labels, 'shape_labels')
     dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
     # Tag fixed-length axis with its label
     dataset.dims[0].label = 'batch'
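A hedged note on the API used above: in h5py 2.10+, `special_dtype(vlen=...)` is also available as `vlen_dtype(...)`, and `dataset.dims.create_scale(ds, name)` has been superseded by `ds.make_scale(name)`. A minimal variable-length dataset in that style:

import numpy
import h5py

dtype = h5py.vlen_dtype(numpy.dtype('float32'))

with h5py.File('vlen_features.h5', 'w') as h5file:
    dataset = h5file.create_dataset('features', (3,), dtype=dtype)
    dataset[0] = numpy.arange(4, dtype='float32')
    dataset[1] = numpy.zeros(7, dtype='float32')
    dataset[2] = numpy.array([1.5], dtype='float32')

    shapes = h5file.create_dataset('features_shapes', (3, 1), dtype='uint16')
    shapes[...] = [[4], [7], [1]]
    shapes.make_scale('shapes')            # newer spelling of create_scale
    dataset.dims[0].attach_scale(shapes)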
Example #24
def dump_unicode(obj, h5f, compression=None):
    """ dumps a list object to h5py file"""
    dt = h5.special_dtype(vlen=unicode)
    ll = len(obj)
    dset = h5f.create_dataset('data', shape=(ll, ), compression=compression, dtype=dt)
    dset[:ll] = obj
    h5f.create_dataset('type', data=['unicode'])
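`dump_unicode` is written for Python 2, where `unicode` is a builtin. A hedged Python 3 counterpart keeping the same structure:

import h5py as h5

def dump_unicode_py3(obj, h5f, compression=None):
    """Dump a list of strings to an open h5py file."""
    dt = h5.special_dtype(vlen=str)
    ll = len(obj)
    dset = h5f.create_dataset('data', shape=(ll,), compression=compression, dtype=dt)
    dset[:ll] = obj
    h5f.create_dataset('type', data=['unicode'], dtype=dt)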
Example #25
    def _createDatasetInFile(self, hdf5File, datasetName, roi):
        shape = tuple(roi[1] - roi[0])
        chunks = self._description.chunks
        if chunks is not None:
            # chunks must not be bigger than the data in any dim
            chunks = numpy.minimum(chunks, shape)
            chunks = tuple(chunks)
        compression = self._description.compression
        compression_opts = self._description.compression_opts

        dtype = self._description.dtype
        if dtype == object:
            dtype = h5py.special_dtype(vlen=numpy.uint8)
        dataset = hdf5File.create_dataset(
            datasetName,
            shape=shape,
            dtype=dtype,
            chunks=chunks,
            compression=compression,
            compression_opts=compression_opts,
        )

        # Set data attributes
        if self._description.drange is not None:
            dataset.attrs["drange"] = self._description.drange
        if _use_vigra:
            dataset.attrs["axistags"] = vigra.defaultAxistags(str(self._description.axes)).toJSON()
Example #26
def get_attribute_types(fname):
    if not h5py.is_hdf5(fname):
        return ""

    types=set()
    dt = h5py.special_dtype(vlen=str)
    try:
        h5 = h5py.File(fname, 'r')
        have_type = '/data_descr/types' in h5
        all_types = set(h5['/data_descr/types'])
        for o in h5['/data_descr/ordering']:
            indptr_name='/data/' + o + '_indptr'
            indices_name='/data/' + o + '_indices'
            if indptr_name in h5 and indices_name in h5:
                types.add('Sparse Matrix')  # 'types' is a set; += with a string would raise TypeError
            else:
                if have_type and o in all_types:
                    types.add(h5['/data_descr/types'][o])
                else:
                    t=h5['/data/' + o].dtype
                    if t==dt:
                        types.add("String")
                    elif t in (numpy.int64, numpy.int32):
                        types.add("Integer")
                    elif t in (numpy.float64, numpy.float32):
                        types.add("Floating Point")
                    else:
                        types.add(str(t))
        h5.close()
    except:
        pass

    return ','.join(list(types))
Example #27
    def test_create_array_string(self):
        file_name = get_temp_file()

        with h5py.File(file_name) as loc:

            shape = (32,)
            a = np.zeros(shape, dtype=np.dtype('|S3'))
            path = '/string/32'
            msg = create_array(loc, path, a)
            self.assertEqual(msg, path)
            
            shape = (32,1)
            a = np.zeros(shape, dtype=np.dtype('|U2'))
            path = '/string/32 x 1'
            msg = create_array(loc, path, a)
            self.assertEqual(msg, path)

            shape = (1, 32)
            a = np.zeros(shape, dtype=h5py.special_dtype(vlen=str))
            path = '/string/1 x 32'
            msg = create_array(loc, path, a)
            self.assertEqual(msg, path)

            shape = (8, 16)
            a = np.zeros(shape, dtype=np.dtype('|S7'))
            path = '/float/8 x 16'
            msg = create_array(loc, path, a)
            self.assertEqual(msg, path)

            shape = (8, 16, 4)
            a = np.zeros(shape, dtype=np.dtype('|U8'))
            path = '/float/8 x 16 x 4'
            msg = create_array(loc, path, a)
            self.assertEqual(msg, path)
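The test covers fixed-length bytes (`|S...`), NumPy unicode (`|U...`) and vlen strings. A hedged aside: plain h5py stores `|S` and vlen dtypes natively, while `|U` arrays generally need converting (e.g. to an object/vlen dtype) before writing; `create_array` above presumably handles that. A small sketch:

import numpy as np
import h5py

with h5py.File('string_kinds.h5', 'w') as f:
    # Fixed-length byte strings map directly to fixed HDF5 strings.
    f.create_dataset('fixed', data=np.array([b'abc', b'de'], dtype='|S3'))

    # Variable-length strings via the special dtype.
    vlen = f.create_dataset('vlen', (2,), dtype=h5py.special_dtype(vlen=str))
    vlen[...] = ['abc', 'a much longer string']

    # A '|U2' array converted element-wise before writing.
    u = np.array(['ab', 'cd'], dtype='|U2')
    from_u = f.create_dataset('from_U', (2,), dtype=h5py.special_dtype(vlen=str))
    from_u[...] = u.astype(object)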
Example #28
def _write_arrays(group, name, data, parent=None):
    grefs = group.create_group('_refs_{}'.format(name))
    ref_dtype = _h5.special_dtype(ref=_h5.Reference)
    dname = group.create_dataset(name, (_np.size(data),), dtype=ref_dtype)
    # ======================================
    # Create datasets
    # ======================================
    for i, array in enumerate(data):
        if array.dtype == _np.dtype(object):
            # ======================================
            # If dataset can't be created, nest
            # ======================================
            darray = _write_arrays(grefs, '{}'.format(i), array, parent=name)
        else:
            darray = grefs.create_dataset(name='{}'.format(i), data=array, shape=_np.shape(array), compression="gzip")

        # ======================================
        # Store reference in dataset
        # ======================================
        dname[i] = darray.ref

    # if parent == 'hist':
    #     pdb.set_trace()

    # ======================================
    # Return created dataset
    # ======================================
    return dname
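`_write_arrays` stores object references so that ragged or nested arrays can be reassembled later. A hedged sketch of writing and then dereferencing such a layout (names are illustrative):

import numpy as np
import h5py

with h5py.File('refs_demo.h5', 'w') as f:
    grefs = f.create_group('_refs_payload')
    ref_dtype = h5py.special_dtype(ref=h5py.Reference)
    dname = f.create_dataset('payload', (2,), dtype=ref_dtype)

    for i, arr in enumerate([np.arange(3), np.arange(5) * 2.0]):
        darray = grefs.create_dataset(str(i), data=arr)
        dname[i] = darray.ref  # store a reference to the real dataset

    # Reading back: indexing the file with a stored reference yields the dataset.
    for ref in f['payload'][...]:
        print(f[ref][...])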
Example #29
def dump_unicode(obj, h5f, **kwargs):
    """ dumps a list object to h5py file"""
    dt = h5.special_dtype(vlen=unicode)
    ll = len(obj)
    dset = h5f.create_dataset('data', shape=(ll, ), dtype=dt, **kwargs)
    dset[:ll] = obj
    h5f.create_dataset('type', data=['unicode'])
Example #30
    def get_state(self, state):
        """Saves the vocabulary in a network state file.

        If there already is a vocabulary in the state, it will be replaced, so
        it has to have the same number of words.

        :type state: h5py.File
        :param state: HDF5 file for storing the neural network parameters
        """

        h5_vocabulary = state.require_group('vocabulary')

        if 'words' in h5_vocabulary:
            state['words'][:] = self.id_to_word
        else:
            str_dtype = h5py.special_dtype(vlen=str)
            h5_vocabulary.create_dataset('words',
                                         data=self.id_to_word,
                                         dtype=str_dtype)

        if 'classes' in h5_vocabulary:
            state['classes'][:] = self.word_id_to_class_id
        else:
            h5_vocabulary.create_dataset('classes', data=self.word_id_to_class_id)

        probs = [self._word_classes[class_id].get_prob(word_id)
                 for word_id, class_id in enumerate(self.word_id_to_class_id)]
        if 'probs' in h5_vocabulary:
            state['probs'][:] = probs
        else:
            h5_vocabulary.create_dataset('probs', data=probs)
Example #31
def CifarAnalysis(folderName=None, batchsize=1000, **kwd):
    id_gpu = 0

    OutStr = ""
    OutStr += 'GPU: {}\n'.format(id_gpu)
    OutStr += 'Minibatch-size: {}\n'.format(batchsize)
    OutStr += 'kwd: {}\n'.format(kwd)
    OutStr += ''
    print OutStr

    fOutput = None
    if folderName:
        if not os.path.exists(folderName):
            os.makedirs(folderName)
        fOutput = open(os.path.join(folderName, "output.dat"), "w")
        shutil.copyfile(__file__,
                        os.path.join(folderName, os.path.basename(__file__)))

# Prepare dataset
    data_tr = np.zeros((50000, 3 * 32 * 32), dtype=np.float32)
    data_ev = np.zeros((10000, 3 * 32 * 32), dtype=np.float32)
    label_tr = np.zeros((50000), dtype=np.int32)
    label_ev = np.zeros((10000), dtype=np.int32)
    I_colors = 3
    I_Xunit = 32
    I_Yunit = 32
    F_unit = 100  # be careful!!

    h5f_tr = h5py.File("data_cifar100/train.h5f", "r")
    data_tr[:] = h5f_tr["ZCA_byTrainData/data"].value
    label_tr[:] = h5f_tr["Info/fine_labels"].value

    h5f_ev = h5py.File("data_cifar100/test.h5f", "r")
    data_ev[:] = h5f_ev["ZCA_byTrainData/data"].value
    label_ev[:] = h5f_ev["Info/fine_labels"].value

    ## Prep
    x_tr = data_tr.reshape((len(data_tr), 3, 32, 32))
    x_ev = data_ev.reshape((len(data_ev), 3, 32, 32))
    y_tr = label_tr
    y_ev = label_ev
    N_tr = len(data_tr)  # 50000
    N_ev = len(data_ev)  # 10000

    ag = Augument.Augumentation()

    ## Define analisis
    Resume = None
    if "Resume" in kwd:
        Resume = kwd["Resume"]
        del kwd["Resume"]

    model, ModelKwd = net.GenModel(I_colors=I_colors,
                                   I_Xunit=I_Xunit,
                                   I_Yunit=I_Yunit,
                                   F_unit=F_unit,
                                   **kwd)
    if id_gpu >= 0:
        cuda.get_device(id_gpu).use()
        model.to_gpu()
    xp = np if id_gpu < 0 else cuda.cupy

    # Setup optimizer
    optimizer = optimizers.Adam()
    optimizer.setup(model)

    # Init/Resume
    if Resume:
        print 'Load optimizer state from %s' % (Resume)
        with h5py.File(Resume, "r") as f:
            s = HDF5Deserializer(f)

            s_model = s["model"]
            s_model.load(model)


# Setup stop manager
    sm = StopManager.StopManager()
    sm.SetMaximumEpoch(10000)
    sm.SetMinimumEpoch(10)
    sm.SetStopThreshold(3e-4)
    print sm

    # Learning loop
    if fOutput: fOutput.write("epoch,mode,loss,accuracy\n")
    #for epoch in six.moves.range(1, n_epoch + 1):
    epoch = 0
    while True:
        epoch += 1
        print 'epoch %d' % epoch

        # training
        perm = np.random.permutation(N_tr)
        sum_accuracy = 0
        sum_loss = 0
        start = time.time()
        for i in six.moves.range(0, N_tr, batchsize):
            bx = x_tr[perm[i:i + batchsize]]
            #if epoch>10: bx = ag.Aug(bx)
            #print bx[0]
            #bx = ag.Aug(bx)
            #print bx[0]
            #raw_input()
            x = chainer.Variable(xp.asarray(bx))
            t = chainer.Variable(xp.asarray(y_tr[perm[i:i + batchsize]]))

            # Pass the loss function (Classifier defines it) and its arguments
            model.predictor.setTrainMode(True)
            optimizer.update(model, x, t)

            if (epoch == 1 and i == 0) and folderName:
                with open(os.path.join(folderName, 'graph.dot'), 'w') as o:
                    g = computational_graph.build_computational_graph(
                        (model.loss, ))
                    o.write(g.dump())
                print 'graph generated'

            sum_loss += float(model.loss.data) * len(t.data)
            sum_accuracy += float(model.accuracy.data) * len(t.data)
        end = time.time()
        elapsed_time = end - start
        throughput = N_tr / elapsed_time
        print 'train mean loss=%.5f, accuracy=%.2f%%, throughput=%.0f images/sec' % (
            sum_loss / N_tr, sum_accuracy / N_tr * 100., throughput)

        if fOutput:
            fOutput.write("%d,Train,%e,%e\n" %
                          (epoch, sum_loss / N_tr, sum_accuracy / N_tr))

        # evaluation
        perm = np.random.permutation(N_ev)
        sum_accuracy = 0
        sum_loss = 0
        for i in six.moves.range(0, N_ev, batchsize):
            x = chainer.Variable(xp.asarray(x_ev[perm[i:i + batchsize]]),
                                 volatile='on')
            t = chainer.Variable(xp.asarray(y_ev[perm[i:i + batchsize]]),
                                 volatile='on')
            model.predictor.setTrainMode(False)
            loss = model(x, t)
            sum_loss += float(loss.data) * len(t.data)
            sum_accuracy += float(model.accuracy.data) * len(t.data)
        print 'test  mean loss=%.5f, accuracy=%.2f%%' % (
            sum_loss / N_ev,
            sum_accuracy / N_ev * 100,
        )
        sm.AddAccuracy(sum_accuracy / N_ev)
        print sm.GetInfo()
        if fOutput:
            fOutput.write("%d,Test,%e,%e\n" %
                          (epoch, sum_loss / N_ev, sum_accuracy / N_ev))

        StopFlag = sm.StopCheck()

        if folderName and (epoch % 1 == 0 or StopFlag):
            # Save the model and the optimizer
            if StopFlag:
                myFname = os.path.join(folderName, 'mlp_final')
            else:
                myFname = os.path.join(folderName, 'mlp_%d' % epoch)

            with h5py.File(myFname + ".hdf5", "w") as f:
                s = HDF5Serializer(f)
                s["model"].save(model)
                f.create_dataset("kwd",
                                 data=ModelKwd.__str__(),
                                 dtype=h5py.special_dtype(vlen=unicode))
                f.create_dataset("net",
                                 data=netFile,
                                 dtype=h5py.special_dtype(vlen=unicode))
                f.flush()

        if StopFlag: break

    if fOutput: fOutput.close()
Example #32
def processNMostCommon(N=3,
                       wavdirpath=PATH_TRAIN_IN_16KWAVS,
                       xmlpicklepath=PATH_TEST_OUT_XMLPICKLEFILE,
                       todirrootpath=PATH_TEST_OUT_HDF5):
    global spectrogramWindowLength

    if not os.path.exists(todirrootpath):
        os.makedirs(todirrootpath)

    spectrogramHeight = 200

    f = h5py.File(
        os.path.join(todirrootpath, "data_top{}_nozero.hdf5".format(N)), "w")
    dsetX = f.create_dataset(
        'X', (0, 1, spectrogramHeight, spectrogramWindowLength),
        maxshape=(None, 1, spectrogramHeight, spectrogramWindowLength))
    dsety = f.create_dataset('y', (0, N), maxshape=(None, N))
    dsetMediaId = f.create_dataset('MediaId', (0, 1), maxshape=(None, 1))
    dsetClassId = f.create_dataset('ClassId', (0, 1),
                                   maxshape=(None, 1),
                                   dtype=h5py.special_dtype(vlen=unicode))

    import pickle
    df = pd.read_pickle(xmlpicklepath)  # read the metadata

    # if we would like to keep recordings with a given quality than we can do it here by uncommenting the next line
    #df = filterByQuality(df, 0, 3)

    df["OFGS"] = df.apply(mergeOFGS,
                          axis=1)  # merge Order, Family, Genus, Species
    df_mc = getMostCommon(df, N)  # get N most common classes from the dataset
    df = None  # let GC free up some memory
    print("Metadata loaded")

    # Shuffle rows
    df_mc = df_mc.iloc[np.random.permutation(len(df_mc))]
    df_mc.reset_index(drop=True, inplace=True)
    (lb, binaryLabels) = getOneHotClassId(df_mc)  # generate one-hot labels
    pickle.dump(
        lb,
        open(
            os.path.join(todirrootpath,
                         "labelBinarizer_top{}.pickle".format(N)), 'wb'))

    # process the selected files of top N classes and save the data into HDF5
    fileRanges = np.hstack((np.arange(0, len(df_mc), 30), len(df_mc)))
    for i in range(len(fileRanges) - 1):
        tempSG = wavsToSpectrogramByList(
            wavdirpath,
            df_mc.FileName[fileRanges[i]:fileRanges[i + 1]],
            dontFilter=False)
        X, y, fn, cIds = spectrogramListToT4(tempSG, \
      binaryLabels[fileRanges[i]: fileRanges[i+1]], \
      filenames = df_mc.MediaId[fileRanges[i]: fileRanges[i+1]].values, N=spectrogramWindowLength, \
      classIds = df_mc.ClassId[fileRanges[i]: fileRanges[i+1]].values) #convert to t4
        pre_len = dsetX.shape[0]
        add_len = X.shape[0]
        dsetX.resize(pre_len + add_len, axis=0)
        dsety.resize(pre_len + add_len, axis=0)
        dsetMediaId.resize(pre_len + add_len, axis=0)
        dsetClassId.resize(pre_len + add_len, axis=0)
        dsetX[pre_len:pre_len + add_len, :, :, :] = X
        dsety[pre_len:pre_len + add_len, :] = y
        dsetMediaId[pre_len:pre_len + add_len, :] = np.transpose(
            [[int(i) for i in fn]])
        dsetClassId[pre_len:pre_len + add_len, :] = np.transpose(
            [[s.encode('utf8') for s in cIds]])
        f.flush()

    f.close()
    return (X, y, fn)  # return last batch for debug purposes
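The loop above grows its datasets batch by batch; the enabling detail is `maxshape=(None, ...)` at creation time plus `resize()` before each write. A minimal hedged sketch with synthetic batches:

import numpy as np
import h5py

with h5py.File('grow_demo.hdf5', 'w') as f:
    dsetX = f.create_dataset('X', (0, 1, 4, 4), maxshape=(None, 1, 4, 4))

    for batch in range(3):
        X = np.random.rand(5, 1, 4, 4)          # synthetic batch
        pre_len = dsetX.shape[0]
        dsetX.resize(pre_len + X.shape[0], axis=0)
        dsetX[pre_len:pre_len + X.shape[0]] = X
        f.flush()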
Example #33
    #print(token_ids, len(token_ids))
    tokens_tensor = torch.tensor([token_ids])
    token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim)])
    #print(token_type_tensor)
    _, _, attn_data_list = model(tokens_tensor,
                                 token_type_ids=token_type_tensor)
    attn_tensor = torch.stack(
        [attn_data['attn_probs'] for attn_data in attn_data_list])
    attention[sent] = attn_tensor.data.numpy()

L = len(sentences)

sent_id = []
attentions = []
for idx in attention:
    sent_id.append(idx)
    attentions.append(attention[idx])

f = h5py.File('attn.h5', 'w')
dt = h5py.special_dtype(vlen=np.dtype('float64'))
dataset = f.create_dataset('vlen', (
    L,
    12,
    1,
    12,
), dtype=dt)
dataset.value
for i in range(len(attentions)):
    dataset[i] = attentions[i]
dataset.value
f.close()
Example #34
def create_target(df, prefix):
    print(f'There are {len(df)} bounding boxes matched in {prefix}...')

    # setting path to data
    datapath = os.path.join(modelpath, f"{prefix}_data_300_vgg.h5")

    # sorting all df
    df = df.sort_values('ImageID')

    # open file and maintain it opened
    f = h5py.File(datapath, 'w')

    try:
        # get the first image
        img = df.iloc[0]

        # setting initial states to iterate over all dataframe
        img_name = img[0]
        img_path = img[7]
        images = []
        target = [img[13:].tolist() + img[9:13].tolist()]

        # iterate over all data set
        for i, img in tqdm(enumerate(df.iloc[:, :].itertuples())):
            # in first iteration of each batch size create the group
            # save last image when the new one is new
            if img_name != img[1]:
                images.append([
                    img_name.encode("ascii", "ignore"),
                    img_path.encode("ascii", "ignore")
                ])

                # create a dataset with the position and classification
                f.create_dataset(name=img_name,
                                 data=target,
                                 dtype=np.float32,
                                 compression='gzip',
                                 compression_opts=4)

                # clean all states
                target = []
                img_name = img[1]
                img_path = img[8]

            target.append(list(img[14:] + img[10:14]))

        f.create_dataset(name=img_name,
                         data=target[0],
                         dtype=np.float32,
                         compression='gzip',
                         compression_opts=4)

        f.create_dataset(name='images',
                         shape=(len(images), 2),
                         data=images,
                         dtype=h5py.special_dtype(vlen=str),
                         compression='gzip',
                         compression_opts=4)

    finally:
        f.close()
Example #35
def savetoqmcpack(cell, mf, title="Default", kpts=[]):
    import h5py, re
    from collections import defaultdict
    from pyscf.pbc import gto, scf, df, dft

    PBC = False
    UnRestricted = False
    Complex = False

    val = str(mf)
    ComputeMode = re.split('[. ]', val)

    SizeMode = len(ComputeMode)
    for i in range(SizeMode):
        if ComputeMode[i] in ("UHF", "KUHF", "UKS"):
            UnRestricted = True
        if ComputeMode[i] == "pbc":
            PBC = True

    if PBC and len(kpts) == 0:
        sys.exit(
            "You need to specify explicit the list of K-point (including gamma)"
        )

    IonName = dict([('H', 1), ('He', 2), ('Li', 3), ('Be', 4), ('B', 5),
                    ('C', 6), ('N', 7), ('O', 8), ('F', 9), ('Ne', 10),
                    ('Na', 11), ('Mg', 12), ('Al', 13), ('Si', 14), ('P', 15),
                    ('S', 16), ('Cl', 17), ('Ar', 18), ('K', 19), ('Ca', 20),
                    ('Sc', 21), ('Ti', 22), ('V', 23), ('Cr', 24), ('Mn', 25),
                    ('Fe', 26), ('Co', 27), ('Ni', 28), ('Cu', 29), ('Zn', 30),
                    ('Ga', 31), ('Ge', 32), ('As', 33), ('Se', 34), ('Br', 35),
                    ('Kr', 36), ('Rb', 37), ('Sr', 38), ('Y', 39), ('Zr', 40),
                    ('Nb', 41), ('Mo', 42), ('Tc', 43), ('Ru', 44), ('Rh', 45),
                    ('Pd', 46), ('Ag', 47), ('Cd', 48), ('In', 49), ('Sn', 50),
                    ('Sb', 51), ('Te', 52), ('I', 53), ('Xe', 54), ('Cs', 55),
                    ('Ba', 56), ('La', 57), ('Ce', 58), ('Pr', 59), ('Nd', 60),
                    ('Pm', 61), ('Sm', 62), ('Eu', 63), ('Gd', 64), ('Tb', 65),
                    ('Dy', 66), ('Ho', 67), ('Er', 68), ('Tm', 69), ('Yb', 70),
                    ('Lu', 71), ('Hf', 72), ('Ta', 73), ('W', 74), ('Re', 75),
                    ('Os', 76), ('Ir', 77), ('Pt', 78), ('Au', 79), ('Hg', 80),
                    ('Tl', 81), ('Pb', 82), ('Bi', 83), ('Po', 84), ('At', 85),
                    ('Rn', 86), ('Fr', 87), ('Ra', 88), ('Ac', 89), ('Th', 90),
                    ('Pa', 91), ('U', 92), ('Np', 93)])

    H5_qmcpack = h5py.File(title + '.h5', 'w')
    groupApp = H5_qmcpack.create_group("application")
    CodeData = groupApp.create_dataset("code", (1, ), dtype="S5")
    CodeData[0:] = "PySCF"
    CodeVer = groupApp.create_dataset("version", (3, ), dtype="i4")
    CodeVer[0:] = 1
    CodeVer[1:] = 4
    CodeVer[2:] = 2

    natom = cell.natm

    dt = h5py.special_dtype(vlen=bytes)
    #Group Atoms
    groupAtom = H5_qmcpack.create_group("atoms")

    #Dataset Number Of Atoms
    groupAtom.create_dataset("number_of_atoms", (1, ), dtype="i4", data=natom)

    #Dataset Number Of Species
    #Species contains (Atom_Name, Atom_Number,Atom_Charge,Atom_Core)
    l_atoms = [(cell.atom_symbol(x), IonName[cell.atom_symbol(x)],
                cell.atom_charge(x), cell.atom_nelec_core(x))
               for x in range(natom)]

    d = defaultdict(list)
    for i, t in enumerate(l_atoms):
        d[t].append(i)

    idxSpeciestoAtoms = dict()
    uniq_atoms = dict()
    for i, (k, v) in enumerate(d.items()):
        idxSpeciestoAtoms[i] = v
        uniq_atoms[i] = k

    idxAtomstoSpecies = dict()
    for k, l_v in idxSpeciestoAtoms.items():
        for v in l_v:
            idxAtomstoSpecies[v] = k

    NbSpecies = len(idxSpeciestoAtoms.keys())

    groupAtom.create_dataset("number_of_species", (1, ),
                             dtype="i4",
                             data=NbSpecies)

    #Dataset positions
    MyPos = groupAtom.create_dataset("positions", (natom, 3), dtype="f8")
    for x in range(natom):
        MyPos[x:] = cell.atom_coord(x)

    #Group Atoms
    for x in range(NbSpecies):
        atmname = str(uniq_atoms[x][0])
        groupSpecies = groupAtom.create_group("species_" + str(x))
        groupSpecies.create_dataset("atomic_number", (1, ),
                                    dtype="i4",
                                    data=uniq_atoms[x][1])
        mylen = "S" + str(len(atmname))
        AtmName = groupSpecies.create_dataset("name", (1, ), dtype=mylen)
        AtmName[0:] = atmname
        groupSpecies.create_dataset("charge", (1, ),
                                    dtype="f8",
                                    data=uniq_atoms[x][2])
        groupSpecies.create_dataset("core", (1, ),
                                    dtype="f8",
                                    data=uniq_atoms[x][3])
    SpeciesID = groupAtom.create_dataset("species_ids", (natom, ), dtype="i4")

    for x in range(natom):
        SpeciesID[x:] = idxAtomstoSpecies[x]

    #Parameter Group
    GroupParameter = H5_qmcpack.create_group("parameters")
    GroupParameter.create_dataset("ECP", (1, ),
                                  dtype="b1",
                                  data=bool(cell.has_ecp()))
    bohrUnit = True
    Spin = cell.spin

    GroupParameter.create_dataset("Unit", (1, ), dtype="b1", data=bohrUnit)
    GroupParameter.create_dataset("NbAlpha", (1, ),
                                  dtype="i4",
                                  data=cell.nelec[0])
    GroupParameter.create_dataset("NbBeta", (1, ),
                                  dtype="i4",
                                  data=cell.nelec[1])
    GroupParameter.create_dataset("NbTotElec", (1, ),
                                  dtype="i4",
                                  data=cell.nelec[0] + cell.nelec[1])
    GroupParameter.create_dataset("spin", (1, ), dtype="i4", data=Spin)

    #basisset Group
    GroupBasisSet = H5_qmcpack.create_group("basisset")
    #Dataset Number Of Atoms
    GroupBasisSet.create_dataset("NbElements", (1, ),
                                 dtype="i4",
                                 data=NbSpecies)

    LCAOName = GroupBasisSet.create_dataset("name", (1, ), dtype="S8")
    LCAOName[0:] = "LCAOBSet"

    #atomicBasisSets Group
    for x in range(NbSpecies):

        MyIdx = idxAtomstoSpecies[x]
        atomicBasisSetGroup = GroupBasisSet.create_group("atomicBasisSet" +
                                                         str(x))
        mylen = "S" + str(len(uniq_atoms[x][0]))
        elemtype = atomicBasisSetGroup.create_dataset("elementType", (1, ),
                                                      dtype=mylen)
        elemtype[0:] = uniq_atoms[x][0]
        if cell.cart == True:
            Angular = atomicBasisSetGroup.create_dataset("angular", (1, ),
                                                         dtype="S9")
            ExpandYLM = atomicBasisSetGroup.create_dataset("expandYlm", (1, ),
                                                           dtype="S6")
            Angular[0:] = "cartesian"
            ExpandYLM[0:] = "Gamess"
        else:
            Angular = atomicBasisSetGroup.create_dataset("angular", (1, ),
                                                         dtype="S9")
            ExpandYLM = atomicBasisSetGroup.create_dataset("expandYlm", (1, ),
                                                           dtype="S5")
            Angular[0:] = "spherical"
            ExpandYLM[0:] = "pyscf"

        atomicBasisSetGroup.create_dataset("grid_npts", (1, ),
                                           dtype="i4",
                                           data=1001)
        atomicBasisSetGroup.create_dataset("grid_rf", (1, ),
                                           dtype="i4",
                                           data=100)
        atomicBasisSetGroup.create_dataset("grid_ri", (1, ),
                                           dtype="f8",
                                           data=1e-06)
        gridType = atomicBasisSetGroup.create_dataset("grid_type", (1, ),
                                                      dtype="S3")
        gridType[0:] = "log"

        try:
            mylen = "S" + str(len(cell.basis))
            nameBase = atomicBasisSetGroup.create_dataset("name", (1, ),
                                                          dtype=mylen)
            nameBase[0:] = cell.basis
        except:
            nameBase = atomicBasisSetGroup.create_dataset("name", (1, ),
                                                          dtype="S8")
            nameBase[0:] = "gaussian"

        Normalized = atomicBasisSetGroup.create_dataset("normalized", (1, ),
                                                        dtype="S2")
        Normalized[0:] = "no"

        nshell = cell.atom_shell_ids(MyIdx)
        n = 0
        for i in nshell:
            l = cell.bas_angular(i)
            contracted_coeffs = cell.bas_ctr_coeff(i)
            contracted_exp = cell.bas_exp(i)
            for line in zip(*contracted_coeffs):
                BasisGroup = atomicBasisSetGroup.create_group("basisGroup" +
                                                              str(n))
                basisType = BasisGroup.create_dataset("type", (1, ),
                                                      dtype="S8")
                basisType[0:] = "Gaussian"

                mylen = "S" + str(len((uniq_atoms[x][0] + str(n) + str(l))))
                RID = BasisGroup.create_dataset("rid", (1, ), dtype=mylen)
                RID[0:] = (uniq_atoms[x][0] + str(n) + str(l))

                BasisGroup.create_dataset("Shell_coord", (3, ),
                                          dtype="f8",
                                          data=cell.bas_coord(i))
                BasisGroup.create_dataset("NbRadFunc", (1, ),
                                          dtype="i4",
                                          data=cell.bas_nprim(i))
                Val_l = BasisGroup.create_dataset("l", (1, ),
                                                  dtype="i4",
                                                  data=l)
                Val_n = BasisGroup.create_dataset("n", (1, ),
                                                  dtype="i4",
                                                  data=n)
                RadGroup = BasisGroup.create_group("radfunctions")
                # print "<basisGroup",n," rid=",uniq_atoms[x][0]+str(n)+str(l)," n=",n,"  l=",l ,"NbRadFunc=",cell.bas_nprim(i),"type=Gaussian>"
                IdRad = 0

                for e, c in zip(contracted_exp, line):
                    DataRadGrp = RadGroup.create_group("DataRad" + str(IdRad))
                    DataRadGrp.create_dataset("exponent", (1, ),
                                              dtype="f8",
                                              data=e)
                    DataRadGrp.create_dataset("contraction", (1, ),
                                              dtype="f8",
                                              data=c)
                    #    print  "<radfunc exponent=",e," contraction=",c, "DataRad=",n,"IdRad=",IdRad,"/>"
                    IdRad += 1
                n += 1

        atomicBasisSetGroup.create_dataset("NbBasisGroups", (1, ),
                                           dtype="i4",
                                           data=n)

    def is_complex(l):
        try:
            return is_complex(l[0])
        except:
            return bool(l.imag)

    GroupDet = H5_qmcpack.create_group("determinant")

    if cell.cart == True:
        d_gms_order = {
            0: ["s"],
            1: ["x", "y", "z"],
            2: ["xx", "yy", "zz", "xy", "xz", "yz"],
            3: [
                "xxx", "yyy", "zzz", "xxy", "xxz", "yyx", "yyz", "zzx", "zzy",
                "xyz"
            ],
            4: [
                "xxxx", "yyyy", "zzzz", "xxxy", "xxxz", "yyyx", "yyyz", "zzzx",
                "zzzy", "xxyy", "xxzz", "yyzz", "xxyz", "yyxz", "zzxy", "xxxx",
                "yyyy", "zzzz", "xxxy", "xxxz", "yyyx", "yyyz", "zzzx", "zzzy",
                "xxyy", "xxzz", "yyzz", "xxyz", "yyxz", "zzxy"
            ]
        }

        d_l = {'s': 0, 'p': 1, 'd': 2, 'f': 3, 'g': 4}

        def n_orbital(n):
            if n == 0:
                return 1
            elif n == 1:
                return 3
            else:
                return 2 * n_orbital(n - 1) - n_orbital(n - 2) + 1

        def compare_gamess_style(item1, item2):
            # Warning:
            # 	- d_gms_order is a global variable
            n1, n2 = map(len, (item1, item2))
            assert (n1 == n2)
            try:
                l = d_gms_order[n1]
            except KeyError:
                return 0
            else:
                a = l.index(item1)
                b = l.index(item2)
                return cmp(a, b)

        ao_label = cell.ao_labels(False)

        # Create a list of shell
        l_l = []
        for label, name, t, l in ao_label:
            # Change yyx -> xyy
            q = "".join(sorted(l, key=l.count, reverse=True))
            l_l.append(q)

        # Pyscf ordering of shell
        l_order = list(range(len(l_l)))

        # Shell ordering indexed
        n = 1
        l_order_new = []
        for i, (label, name, t, l) in enumerate(ao_label):
            r = d_l[t[-1]]
            # print r,n_orbital(r)
            if n != 1:
                n -= 1
            else:
                n = n_orbital(r)
                unordered_l = l_l[i:i + n]
                unordered = l_order[i:i + n]
                #print i,n,unordered
                ordered = [
                    x for _, x in sorted(zip(unordered_l, unordered),
                                         key=lambda p: p[0],
                                         cmp=compare_gamess_style)
                ]
                l_order_new.extend(ordered)

        def order_mo_coef(ll):
            # Order a list of transposed mo_coeff (Ao,Mo) -> (Mo,Ao) ordered
            # Warning:
            #	- l_order_new is used as global variable
            #	- gamess order

            ll_new = []
            for l in zip(*ll):
                ll_new.append([l[i] for i in l_order_new])
            return ll_new

    mo_coeff = mf.mo_coeff
    Complex = is_complex(mo_coeff)
    if Complex:
        mytype = "c16"
    else:
        mytype = "f8"

    GroupParameter.create_dataset("IsComplex", (1, ), dtype="b1", data=Complex)

    GroupParameter.create_dataset("SpinUnResticted", (1, ),
                                  dtype="b1",
                                  data=UnRestricted)
    if not PBC:
        if UnRestricted == False:
            NbMO = len(mo_coeff)
            NbAO = len(mo_coeff[0])
            if cell.cart == True:
                eigenset = GroupDet.create_dataset(
                    "eigenset_0", (NbMO, NbAO),
                    dtype="f8",
                    data=order_mo_coef(mo_coeff))
            else:
                eigenset = GroupDet.create_dataset("eigenset_0", (NbMO, NbAO),
                                                   dtype="f8",
                                                   data=zip(*mo_coeff))
        else:
            NbMO = len(mo_coeff[0])
            NbAO = len(mo_coeff[0][0])
            eigenset_up = GroupDet.create_dataset("eigenset_0", (NbMO, NbAO),
                                                  dtype="f8",
                                                  data=order_mo_coef(
                                                      mo_coeff[0]))
            eigenset_dn = GroupDet.create_dataset("eigenset_1", (NbMO, NbAO),
                                                  dtype="f8",
                                                  data=order_mo_coef(
                                                      mo_coeff[1]))
    else:
        #Cell Parameters
        GroupCell = H5_qmcpack.create_group("Cell")
        GroupCell.create_dataset("LaticeVectors", (3, 3),
                                 dtype="f8",
                                 data=cell.lattice_vectors())

        Nbkpts = len(kpts)
        GroupDet.create_dataset("Nb_Kpoints", (1, ), dtype="i4", data=Nbkpts)
        if not UnRestricted:
            NbMO = len(mo_coeff[0])
            NbAO = len(mo_coeff[0][0])
        else:
            NbMO = len(mo_coeff[0][0])
            NbAO = len(mo_coeff[0][0][0])

        def get_mo(mo_coeff, cart):
            return order_mo_coef(mo_coeff) if cart else zip(*mo_coeff)

        for i in range(Nbkpts):
            GroupKpts = GroupDet.create_group("Kpoint_" + str(i))
            GroupKpts.create_dataset("Coord", (1, 3), dtype="f8", data=kpts[i])
            GroupSpin = GroupKpts.create_group("spin_Up")
            if not UnRestricted:
                mo_coeff_ = get_mo(mo_coeff[i], cell.cart)

                GroupSpin.create_dataset("MO_Coeff", (NbMO, NbAO),
                                         dtype=mytype,
                                         data=mo_coeff_)
                GroupSpin.create_dataset("MO_EIGENVALUES", (1, NbMO),
                                         dtype="f8",
                                         data=mf.mo_energy[i])

            else:
                GroupSpindn = GroupKpts.create_group("spin_Dn")

                mo_coeff_up = get_mo(mo_coeff[0][i], cell.cart)
                mo_coeff_down = get_mo(mo_coeff[1][i], cell.cart)

                GroupSpin.create_dataset("MO_Coeff", (NbMO, NbAO),
                                         dtype=mytype,
                                         data=mo_coeff_up)
                GroupSpindn.create_dataset("MO_Coeff", (NbMO, NbAO),
                                           dtype=mytype,
                                           data=mo_coeff_down)

                GroupSpin.create_dataset("MO_EIGENVALUES", (1, NbMO),
                                         dtype="f8",
                                         data=mf.mo_energy[0][i])
                GroupSpindn.create_dataset("MO_EIGENVALUES", (1, NbMO),
                                           dtype="f8",
                                           data=mf.mo_energy[1][i])

    GroupParameter.create_dataset("COMPLEX", (1, ), dtype="i4", data=Complex)
    GroupParameter.create_dataset("numMO", (1, ), dtype="i4", data=NbMO)
    GroupParameter.create_dataset("numAO", (1, ), dtype="i4", data=NbAO)

    print 'Wavefunction successfully saved to QMCPACK HDF5 Format'
    print 'Use: "convert4qmc -Pyscf  {}.h5" to generate QMCPACK input files'.format(
        title)
    # Close the file before exiting
    H5_qmcpack.close()
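A minimal, self-contained sketch of the dtype selection used above: complex MO coefficients map to the NumPy/HDF5 type string "c16" (complex128) and real ones to "f8" (float64). The file and array names here are made up for illustration.

import numpy as np
import h5py

def h5_dtype_for(mo_coeff):
    # Mirrors the Complex/mytype logic above: "c16" for complex, "f8" for real
    return "c16" if np.iscomplexobj(mo_coeff) else "f8"

real_mo = np.random.rand(4, 4)
cplx_mo = real_mo + 1j * np.random.rand(4, 4)

with h5py.File("dtype_demo.h5", "w") as f:          # hypothetical file name
    f.create_dataset("eigenset_real", data=real_mo, dtype=h5_dtype_for(real_mo))
    f.create_dataset("eigenset_cplx", data=cplx_mo, dtype=h5_dtype_for(cplx_mo))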
Пример #36
0
    def test_create(self):
        filename = self.getFileName("create_attribute")
        print("filename:", filename)
        f = h5py.File(filename, 'w')

        is_hsds = False
        if isinstance(f.id.id, str) and f.id.id.startswith("g-"):
            is_hsds = True  # HSDS has different permission defaults

        g1 = f.create_group('g1')

        g1.attrs['a1'] = 42

        n = g1.attrs['a1']
        self.assertEqual(n, 42)

        self.assertTrue('a1' in g1.attrs)
        self.assertTrue(u'a1' in g1.attrs)
        self.assertTrue(b'a1' in g1.attrs)

        self.assertEqual(len(g1.attrs), 1)

        g1.attrs['b1'] = list(range(10))

        # try replacing 'a1'
        g1.attrs['a1'] = 24

        self.assertEqual(len(g1.attrs), 2)

        # create an attribute with an explicit UTF type
        dt = h5py.special_dtype(vlen=str)
        g1.attrs.create('c1', "Hello HDF", dtype=dt)
        self.assertTrue('c1' in g1.attrs)
        value = g1.attrs['c1']
        self.assertEqual(value, "Hello HDF")

        # create an attribute as a fixed-length string
        g1.attrs.create('d1', np.string_("This is a numpy string"))
        value = g1.attrs['d1']

        attr_names = []
        for a in g1.attrs:
            attr_names.append(a)
        self.assertEqual(len(attr_names), 4)
        self.assertTrue('a1' in attr_names)
        self.assertTrue('b1' in attr_names)
        self.assertTrue('c1' in attr_names)
        self.assertTrue('d1' in attr_names)

        # create an array attribute
        g1.attrs["ones"] = np.ones((10, ))
        arr = g1.attrs["ones"]
        self.assertTrue(isinstance(arr, np.ndarray))
        self.assertEqual(arr.shape, (10, ))
        for i in range(10):
            self.assertEqual(arr[i], 1)

        # array of strings
        g1.attrs['strings'] = [np.string_("Hello"), np.string_("Good-bye")]
        arr = g1.attrs['strings']
        self.assertEqual(arr.shape, (2, ))
        self.assertEqual(arr[0], b"Hello")
        self.assertEqual(arr[1], b"Good-bye")
        #if six.PY3:
        #    self.assertEqual(arr.dtype, h5py.special_dtype(vlen=str))
        #else:
        self.assertEqual(arr.dtype.kind, 'S')
        # TBD - h5serv is returning S11 here for some reason
        #self.assertEqual(arr.dtype, np.dtype("S8"))

        # scalar byte values
        g1.attrs['e1'] = "Hello"
        s = g1.attrs['e1']
        self.assertEqual(s, "Hello")

        # scalar objref attribute
        g11 = g1.create_group('g1.1')  # create subgroup g1/g1.1
        g11.attrs['name'] = 'g1.1'  # tag group with an attribute

        if is_hsds:
            # following is not working with h5serv
            g11_ref = g11.ref  # get ref to g1/g1.1
            self.assertTrue(isinstance(g11_ref, h5py.Reference))
            refdt = h5py.special_dtype(ref=h5py.Reference)  # create ref dtype
            g1.attrs.create('f1', g11_ref,
                            dtype=refdt)  # create attribute with ref to g1.1
            ref = g1.attrs['f1']  # read back the attribute

            refobj = f[ref]  # get the ref'd object
            self.assertTrue('name'
                            in refobj.attrs)  # should see the tag attribute
            self.assertEqual(refobj.attrs['name'], 'g1.1')  # check tag value

        # close file
        f.close()
Пример #37
0
    print("ouput file:", config.output_file)
    f = h5py.File(config.output_file, 'w')

    total_rows = 0
    for input_file in config.input_files:
        if not os.path.isfile(input_file):
            raise ValueError(input_file + " does not exist")
        npz_file = np.load(input_file)
        total_rows += npz_file['event_id'].shape[0]

    dset_labels = f.create_dataset("labels",
                                   shape=(total_rows, ),
                                   dtype=np.int32)
    dset_PATHS = f.create_dataset("root_files",
                                  shape=(total_rows, ),
                                  dtype=h5py.special_dtype(vlen=str))
    dset_IDX = f.create_dataset("event_ids",
                                shape=(total_rows, ),
                                dtype=np.int32)
    dset_event_data = f.create_dataset("event_data",
                                       shape=(total_rows, 27, 27, 38),
                                       dtype=np.float32)
    dset_energies = f.create_dataset("energies",
                                     shape=(total_rows, 1),
                                     dtype=np.float32)
    dset_positions = f.create_dataset("positions",
                                      shape=(total_rows, 1, 3),
                                      dtype=np.float32)
    dset_angles = f.create_dataset("angles",
                                   shape=(total_rows, 2),
                                   dtype=np.float32)
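The copy loop that fills these datasets is not shown above. A hedged sketch, continuing with the variables already created (config, dset_IDX, dset_PATHS) and assuming each .npz carries an 'event_id' array (the only key the counting loop above confirms):

    offset = 0
    for input_file in config.input_files:
        npz_file = np.load(input_file)
        n = npz_file['event_id'].shape[0]
        dset_IDX[offset:offset + n] = npz_file['event_id']
        dset_PATHS[offset:offset + n] = [input_file] * n   # assumption: one source path per row
        offset += n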
Пример #38
0
	def storeClassLabels(self, classLabels):
		dt = h5py.special_dtype(vlen=str) 
		labelSet = self.db.create_dataset("label_names",
			(len(classLabels),), dtype=dt)
		labelSet[:] = classLabels
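A standalone round-trip sketch of the same pattern; a plain h5py.File stands in for the self.db handle of the (not shown) surrounding class:

import h5py

with h5py.File("labels_demo.h5", "w") as db:         # stands in for self.db
    dt = h5py.special_dtype(vlen=str)
    label_set = db.create_dataset("label_names", (3,), dtype=dt)
    label_set[:] = ["cat", "dog", "panda"]

with h5py.File("labels_demo.h5", "r") as db:
    print(db["label_names"][:])   # h5py 3.x returns bytes; use .asstr()[:] for str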
Пример #39
0
def prepare_data(input_folder, output_file, size, input_channels, target_resolution):
    '''
    Main function that prepares a dataset from the raw challenge data to an hdf5 dataset
    '''

    if len(size) != 3:
        raise AssertionError('Inadequate number of size parameters')
    if len(target_resolution) != 3:
        raise AssertionError('Inadequate number of target resolution parameters')


    hdf5_file = h5py.File(output_file, "w")

    file_list = {'test': [], 'train': [], 'validation': []}

    logging.info('Counting files and parsing meta data...')

    pid = 0
    for folder in os.listdir(input_folder):
        print(folder)
        train_test = test_train_val_split(pid)
        pid = pid + 1
        file_list[train_test].append(folder)


    n_train = len(file_list['train'])
    n_test = len(file_list['test'])
    n_val = len(file_list['validation'])

    print('Debug: Check if sets add up to correct value:')
    print(n_train, n_val, n_test, n_train + n_val + n_test)

    # Create datasets for images and masks
    data = {}
    for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]):

        if num_points > 0:
            print([num_points] + list(size) + [input_channels])
            data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size) + [input_channels],
                                                              dtype=np.float32)
            data['masks_%s' % tt] = hdf5_file.create_dataset("masks_%s" % tt, [num_points] + list(size), dtype=np.uint8)
            data['pids_%s' % tt] = hdf5_file.create_dataset("pids_%s" % tt, [num_points] , dtype=h5py.special_dtype(vlen=str))

    mask_list = {'test': [], 'train': [], 'validation': []}
    img_list = {'test': [], 'train': [], 'validation': []}
    pids_list = {'test': [], 'train': [], 'validation': []}

    logging.info('Parsing image files')

    # get the maximum dimension along each axis
    maxX = 0
    maxY = 0
    maxZ = 0
    # maxXCropped = 0
    # maxYCropped = 0
    # maxZCropped = 0
    i = 0
    for train_test in ['test', 'train', 'validation']:
        for folder in file_list[train_test]:
            print("Doing file {}".format(i))
            i += 1

            baseFilePath = os.path.join(input_folder, folder, folder)
            img_c1, _, img_header = utils.load_nii(baseFilePath + "_t1.nii.gz")
            img_c2, _, _ = utils.load_nii(baseFilePath + "_t1ce.nii.gz")
            img_c3, _, _ = utils.load_nii(baseFilePath + "_t2.nii.gz")
            img_c4, _, _ = utils.load_nii(baseFilePath + "_flair.nii.gz")
            img_dat = np.stack((img_c1, img_c2, img_c3, img_c4), 3)

            maxX = max(maxX, img_dat.shape[0])
            maxY = max(maxY, img_dat.shape[1])
            maxZ = max(maxZ, img_dat.shape[2])
            # img_dat_cropped = crop_volume_allDim(img_dat)
            # maxXCropped = max(maxXCropped, img_dat_cropped.shape[0])
            # maxYCropped = max(maxYCropped, img_dat_cropped.shape[1])
            # maxZCropped = max(maxZCropped, img_dat_cropped.shape[2])
    print("Max x: {}, y: {}, z: {}".format(maxX, maxY, maxZ))
    # print("Max cropped x: {}, y: {}, z: {}".format(maxXCropped, maxYCropped, maxZCropped))

    for train_test in ['train', 'test', 'validation']:

        write_buffer = 0
        counter_from = 0

        for folder in file_list[train_test]:

            logging.info('-----------------------------------------------------------')
            logging.info('Doing: %s' % folder)

            patient_id = folder

            baseFilePath = os.path.join(input_folder, folder, folder)
            img_c1, _, img_header = utils.load_nii(baseFilePath + "_t1.nii.gz")
            img_c2, _, _ = utils.load_nii(baseFilePath + "_t1ce.nii.gz")
            img_c3, _, _ = utils.load_nii(baseFilePath + "_t2.nii.gz")
            img_c4, _, _ = utils.load_nii(baseFilePath + "_flair.nii.gz")
            mask, _, _ = utils.load_nii(baseFilePath + "_seg.nii.gz")

            img = np.stack((img_c1, img_c2, img_c3, img_c4), 3)

            # img, mask = crop_volume_allDim(img_dat.copy(), mask_dat.copy())

            pixel_size = (img_header.structarr['pixdim'][1],
                          img_header.structarr['pixdim'][2],
                          img_header.structarr['pixdim'][3])

            logging.info('Pixel size:')
            logging.info(pixel_size)

            ### PROCESSING LOOP FOR 3D DATA ################################

            scale_vector = [pixel_size[0] / target_resolution[0],
                            pixel_size[1] / target_resolution[1],
                            pixel_size[2]/ target_resolution[2]]

            if scale_vector != [1.0, 1.0, 1.0]:
                img = transform.rescale(img, scale_vector, order=1, preserve_range=True, multichannel=True, mode='constant')
                mask = transform.rescale(mask, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant')

            img = crop_or_pad_slice_to_size(img, size, input_channels)
            mask = crop_or_pad_slice_to_size(mask, size)

            img = normalise_image(img)

            img_list[train_test].append(img)
            mask_list[train_test].append(mask)
            pids_list[train_test].append(patient_id)

            write_buffer += 1

            if write_buffer >= MAX_WRITE_BUFFER:

                counter_to = counter_from + write_buffer
                _write_range_to_hdf5(data, train_test, img_list, mask_list, pids_list, counter_from, counter_to)
                _release_tmp_memory(img_list, mask_list, pids_list, train_test)

                # reset stuff for next iteration
                counter_from = counter_to
                write_buffer = 0

        logging.info('Writing remaining data')
        counter_to = counter_from + write_buffer

        if len(file_list[train_test]) > 0:
            _write_range_to_hdf5(data, train_test, img_list, mask_list, pids_list, counter_from, counter_to)
        _release_tmp_memory(img_list, mask_list, pids_list, train_test)

    # After test train loop:
    hdf5_file.close()
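A hedged usage sketch with hypothetical paths; size and target_resolution must each have three entries, and input_channels matches the four stacked modalities (t1, t1ce, t2, flair):

prepare_data(input_folder='/data/brats/raw',
             output_file='/data/brats/brats_160x192x128.hdf5',
             size=(160, 192, 128),
             input_channels=4,
             target_resolution=(1.0, 1.0, 1.0))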
Пример #40
0
def pretrain_save_data_parameters(
    data_dir,
    speakers_file='speakers.list',
    params_file='pretrain_params.h5',
):
    # TODO Document this function
    # Save processing start time
    start_time = time()

    print('Starting')

    longest_sequence = 0
    files_list = []

    num_spk = len([entry for entry in scandir(data_dir) if entry.is_dir()])

    spk_max = np.zeros((num_spk, 42))
    spk_min = 1e+50 * np.ones((num_spk, 42))

    speakers = open(os.path.join(data_dir, speakers_file), 'r').readlines()
    # Strip '\n' characters
    dirs = [line.split('\n')[0] for line in speakers]

    print("Processing speakers' data")
    for spk_index, a_dir in enumerate(dirs):
        for sub_root, _, sub_files in os.walk(os.path.join(data_dir, a_dir)):
            # Get basenames of files in directory
            basenames = list(
                set([
                    os.path.join(sub_root,
                                 file.split('.')[0]) for file in sub_files
                ]))

            files_list += basenames

            for basename in basenames:
                print('Processing ' + basename)

                lf0_params = parse_file(1, basename + '.lf0_log')

                if lf0_params.shape[0] > longest_sequence:
                    longest_sequence = lf0_params.shape[0]

                mcp_params = parse_file(40, basename + '.cc')

                mvf_params = parse_file(1, basename + '.i.fv')

                seq_params = np.concatenate(
                    (mcp_params, lf0_params, mvf_params), axis=1)

                # Compute maximum and minimum values
                spk_max[spk_index, :] = np.maximum(
                    spk_max[spk_index, :], np.ma.max(seq_params, axis=0))
                spk_min[spk_index, :] = np.minimum(
                    spk_min[spk_index, :], np.ma.min(seq_params, axis=0))

    print('Saving data to .h5 file')

    with File(os.path.join(data_dir, params_file), 'w') as f:
        # Save longest_sequence and the max and min values as attributes
        f.attrs.create('longest_sequence', longest_sequence, dtype=int)
        f.attrs.create('speakers_max', spk_max)
        f.attrs.create('speakers_min', spk_min)

        # TODO Support Python 2
        # sys.version_info -> Get running Python version
        dt = special_dtype(vlen=str)

        utf_list = [
            n.encode(encoding="utf-8", errors="ignore") for n in files_list
        ]
        f.create_dataset(name='files_list',
                         shape=(len(utf_list), 1),
                         data=utf_list,
                         dtype=dt)

        f.close()

    print('Elapsed time: ' + display_time(time() - start_time))
    longest_sequence = int(np.floor(longest_sequence * 1.7))

    return longest_sequence, spk_max, spk_min, files_list
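A hedged read-back sketch for the file written above (the data directory is hypothetical); it recovers the attributes and decodes the (N, 1) variable-length string dataset:

import os
import h5py

with h5py.File(os.path.join('data', 'pretrain_params.h5'), 'r') as f:
    longest_sequence = f.attrs['longest_sequence']
    spk_max = f.attrs['speakers_max']                # (num_spk, 42)
    spk_min = f.attrs['speakers_min']
    files_list = [row[0].decode('utf-8') for row in f['files_list'][:]]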
Пример #41
0
import glob
import os
import sqlite3
import time

import h5py
import io
import numpy as np


data_dir = "data_10000" + "/mnt/ramdisk/max/90kDICT32px/"

level_0 = glob.glob(os.path.join(data_dir + "/*"))

if len(level_0) == 0:
  raise ValueError("No files in directoy" + data_dir)

filename = "hdf5-numpy-10000.hdf5"
if os.path.exists(filename): os.remove(filename)
f = h5py.File(filename)
dt = h5py.special_dtype(vlen=np.dtype('uint8'))
dset = f.create_dataset('images', (10000, ), dtype=dt)

################################# SQL file with meta info
### sqlite file
filename = "hdf5-numpy-10000.sqlite"
if os.path.exists(filename): os.remove(filename)
sql_conn = sqlite3.connect(filename)
cur = sql_conn.cursor()
sql_table = 'CREATE TABLE IF NOT EXISTS meta (key integer PRIMARY KEY, label text NOT NULL, path text, shape0 integer, shape1 integer, shape2 integer);'
cur.execute(sql_table)

#################################
files_counter = 0

tic = time.time()
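The per-file loop is not shown here. A self-contained sketch of the underlying pattern (hypothetical file name): a variable-length uint8 dataset stores each encoded image as raw bytes and returns it as a 1-D uint8 array:

import numpy as np
import h5py

dt = h5py.special_dtype(vlen=np.dtype('uint8'))
with h5py.File('vlen-bytes-demo.hdf5', 'w') as f:
    d = f.create_dataset('images', (2, ), dtype=dt)
    d[0] = np.frombuffer(b'\xff\xd8\xff\xe0 fake jpeg bytes', dtype=np.uint8)
    d[1] = np.arange(5, dtype=np.uint8)

with h5py.File('vlen-bytes-demo.hdf5', 'r') as f:
    row = f['images'][0]            # comes back as a 1-D uint8 array
    print(row.tobytes()[:4])        # b'\xff\xd8\xff\xe0'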
Пример #42
0
 def contributors(self, value: list[str]):
     self._contributors = np.asarray(value,
                                     dtype=h5py.special_dtype(vlen=str))
Пример #43
0
def H5AnnotationFile(annotype, annoid, kv=None):
    """Create an HDF5 file and populate the fields. Return a file object.
      This is a support routine for all the RAMON tests."""

    # Create an in-memory HDF5 file
    tmpfile = tempfile.NamedTemporaryFile()
    h5fh = h5py.File(tmpfile.name)

    # Create the top level annotation id namespace
    idgrp = h5fh.create_group(str(annoid))

    # Annotation type
    idgrp.create_dataset("ANNOTATION_TYPE", (1, ), np.uint32, data=annotype)

    # Create a metadata group
    mdgrp = idgrp.create_group("METADATA")

    # now lets add a bunch of random values for the specific annotation type
    ann_status = random.randint(0, 4)
    ann_confidence = random.random()
    ann_author = 'randal'

    # Set Annotation specific metadata
    mdgrp.create_dataset("STATUS", (1, ), np.uint32, data=ann_status)
    mdgrp.create_dataset("CONFIDENCE", (1, ), np.float, data=ann_confidence)
    mdgrp.create_dataset("AUTHOR", (1, ),
                         dtype=h5py.special_dtype(vlen=str),
                         data=ann_author)

    kvpairs = {}
    if kv != None:
        [k, sym, v] = kv.partition(':')
        kvpairs[k] = v

        # Turn our dictionary into a csv file
        fstring = cStringIO.StringIO()
        csvw = csv.writer(fstring, delimiter=',')
        csvw.writerows([r for r in kvpairs.iteritems()])

        # User-defined metadata
        mdgrp.create_dataset("KVPAIRS", (1, ),
                             dtype=h5py.special_dtype(vlen=str),
                             data=fstring.getvalue())

    # Synapse:
    if annotype == 2:

        syn_weight = random.random() * 1000.0
        syn_synapse_type = random.randint(1, 9)
        syn_seeds = [random.randint(1, 1000) for x in range(5)]
        syn_segments = [[random.randint(1, 1000),
                         random.randint(1, 1000)] for x in range(4)]

        mdgrp.create_dataset("WEIGHT", (1, ), np.float, data=syn_weight)
        mdgrp.create_dataset("SYNAPSE_TYPE", (1, ),
                             np.uint32,
                             data=syn_synapse_type)
        mdgrp.create_dataset("SEEDS", (len(syn_seeds), ),
                             np.uint32,
                             data=syn_seeds)
        mdgrp.create_dataset("SEGMENTS", (len(syn_segments), 2),
                             np.uint32,
                             data=syn_segments)

    # Seed
    elif annotype == 3:

        seed_parent = random.randint(1, 1000)
        seed_position = [random.randint(1, 10000) for x in range(3)]
        seed_cubelocation = random.randint(1, 9)
        seed_source = random.randint(1, 1000)

        mdgrp.create_dataset("PARENT", (1, ), np.uint32, data=seed_parent)
        mdgrp.create_dataset("CUBE_LOCATION", (1, ),
                             np.uint32,
                             data=seed_cubelocation)
        mdgrp.create_dataset("SOURCE", (1, ), np.uint32, data=seed_source)
        mdgrp.create_dataset("POSITION", (3, ), np.uint32, data=seed_position)

    # Segment
    elif annotype == 4:

        seg_parentseed = random.randint(1, 100000)
        seg_segmentclass = random.randint(1, 9)
        seg_neuron = random.randint(1, 100000)
        seg_synapses = [random.randint(1, 100000) for x in range(5)]
        seg_organelles = [random.randint(1, 100000) for x in range(5)]

        mdgrp.create_dataset("SEGMENTCLASS", (1, ),
                             np.uint32,
                             data=seg_segmentclass)
        mdgrp.create_dataset("PARENTSEED", (1, ),
                             np.uint32,
                             data=seg_parentseed)
        mdgrp.create_dataset("NEURON", (1, ), np.uint32, data=seg_neuron)
        mdgrp.create_dataset("SYNAPSES", (len(seg_synapses), ), np.uint32,
                             seg_synapses)
        mdgrp.create_dataset("ORGANELLES", (len(seg_organelles), ), np.uint32,
                             seg_organelles)

    # Neuron
    elif annotype == 5:

        neuron_segments = [random.randint(1, 1000) for x in range(10)]
        mdgrp.create_dataset("SEGMENTS", (len(neuron_segments), ), np.uint32,
                             neuron_segments)

    # Organelle
    elif annotype == 6:

        org_parentseed = random.randint(1, 100000)
        org_organelleclass = random.randint(1, 9)
        org_seeds = [random.randint(1, 100000) for x in range(5)]
        org_centroid = [random.randint(1, 10000) for x in range(3)]

        mdgrp.create_dataset("ORGANELLECLASS", (1, ),
                             np.uint32,
                             data=org_organelleclass)
        mdgrp.create_dataset("PARENTSEED", (1, ),
                             np.uint32,
                             data=org_parentseed)
        mdgrp.create_dataset("SEEDS", (len(org_seeds), ), np.uint32, org_seeds)
        mdgrp.create_dataset("CENTROID", (3, ), np.uint32, data=org_centroid)

    h5fh.flush()
    tmpfile.seek(0)
    return tmpfile
Пример #44
0
def utf8(string):
    if isinstance(string, bytes):
        return bytes.decode(string, 'utf-8')
    return string

def py_str(byte_string):
    if isinstance(byte_string, np.ndarray):
        byte_string = bytes(byte_string)
    assert isinstance(byte_string, bytes)
    return byte_string.decode('ASCII')

def isstring(s):
    return isinstance(s, str)

def execcode(code, globals, locals=None):
    if locals is None:
        exec(code, globals)
    else:
        exec(code, globals, locals)

h5vstring = h5py.special_dtype(vlen=bytes)

import builtins
import activepapers.builtins3 as ap_builtins
# Replace the "del exec" in builtins3 by something that's not a
# syntax error under Python 2.
del ap_builtins.__dict__['exec']

raw_input = builtins.input
Пример #45
0
def createSpecificSynapse(annoid, syn_segments, cutout):

    # Create an in-memory HDF5 file
    tmpfile = tempfile.NamedTemporaryFile()
    h5fh = h5py.File(tmpfile.name)

    # Create the top level annotation id namespace
    idgrp = h5fh.create_group(str(annoid))

    # Annotation type
    idgrp.create_dataset("ANNOTATION_TYPE", (1, ), np.uint32, data=2)

    # Create a metadata group
    mdgrp = idgrp.create_group("METADATA")

    # now lets add a bunch of random values for the specific annotation type
    ann_status = random.randint(0, 4)
    ann_confidence = random.random()
    ann_author = 'randal'

    # Set Annotation specific metadata
    mdgrp.create_dataset("STATUS", (1, ), np.uint32, data=ann_status)
    mdgrp.create_dataset("CONFIDENCE", (1, ), np.float, data=ann_confidence)
    mdgrp.create_dataset("AUTHOR", (1, ),
                         dtype=h5py.special_dtype(vlen=str),
                         data=ann_author)

    syn_weight = random.random() * 1000.0
    syn_synapse_type = random.randint(1, 9)

    [resstr, xstr, ystr, zstr] = cutout.split('/')
    (xlowstr, xhighstr) = xstr.split(',')
    (ylowstr, yhighstr) = ystr.split(',')
    (zlowstr, zhighstr) = zstr.split(',')

    resolution = int(resstr)
    xlow = int(xlowstr)
    xhigh = int(xhighstr)
    ylow = int(ylowstr)
    yhigh = int(yhighstr)
    zlow = int(zlowstr)
    zhigh = int(zhighstr)

    anndata = np.ones([zhigh - zlow, yhigh - ylow, xhigh - xlow])
    #import pdb; pdb.set_trace()
    mdgrp.create_dataset("WEIGHT", (1, ), np.float, data=syn_weight)
    mdgrp.create_dataset("SYNAPSE_TYPE", (1, ),
                         np.uint32,
                         data=syn_synapse_type)
    mdgrp.create_dataset("SEGMENTS", (len(syn_segments), 2),
                         np.uint32,
                         data=syn_segments)
    idgrp.create_dataset("RESOLUTION", (1, ), np.uint32, data=resolution)
    idgrp.create_dataset("XYZOFFSET", (3, ),
                         np.uint32,
                         data=[xlow, ylow, zlow])
    idgrp.create_dataset("CUTOUT", anndata.shape, np.uint32, data=anndata)

    h5fh.flush()
    tmpfile.seek(0)
    return tmpfile
Пример #46
0
def load_data(path,
              file_ext=['txt'],
              valid_split=None,
              vocab_file_name=None,
              max_vocab_size=None,
              max_len_w=None,
              output_path=None,
              subset_pct=100):
    """
    Given a path where data are saved, look for the files with the right extensions.
    If a split factor is given, split all the files into training and validation
    sets, then build the vocabulary from the training and validation sets.

    Arguments:
        path: which directory to look for all the documents
        file_ext: what extension of the files to look for
        valid_split: to split the data into train/valid set. If None, no split
        vocab_file_name: optional file name. If None, the script will decide a name
                         given path and split
        max_vocab_size: maximum number of words to use in vocabulary (by most frequent)
        max_len_w: maximum length of sentences in words
        output_path: path used to save preprocessed data and results
        subset_pct: subset of dataset to load into H5 file (percentage)

    Returns:
        The function saves 2 files:
        h5 file with preprocessed data
        vocabulary file with: vocab, reverse_vocab, word_count
    """
    file_names = get_file_list(path, file_ext)

    file_str = get_file_str(path,
                            len(file_names),
                            labelled=False,
                            valid_split=valid_split,
                            subset_pct=subset_pct)

    # create output dir if needed
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    # file name to store the vocabulary
    if vocab_file_name is None:
        vocab_file_name = file_str + '.vocab'
        vocab_file_name = os.path.join(output_path, vocab_file_name)

    # If max sizes aren't set, assume no limit
    if not max_len_w:
        max_len_w = sys.maxsize
    if not max_vocab_size:
        max_vocab_size = sys.maxsize

    # file name to store the pre-processed train/valid dataset
    h5_file_name = os.path.join(output_path, file_str + '.h5')

    if os.path.exists(h5_file_name) and os.path.exists(vocab_file_name):
        neon_logger.display(
            "dataset files {} and vocabulary file {} already exist. "
            "will use cached data. ".format(h5_file_name, vocab_file_name))
        return h5_file_name, vocab_file_name

    # split into training/valid set
    if valid_split is not None:
        if 'json' in file_ext:
            # Split based on number of files
            train_split = int(np.ceil(len(file_names) * (1 - valid_split)))
            train_files = file_names[:train_split]
            valid_files = file_names[train_split:]

            train_sent = load_json_sent(train_files, subset_pct)
            valid_sent = load_json_sent(valid_files, subset_pct)
            all_sent = train_sent + valid_sent
        elif 'txt' in file_ext:
            # Split based on number of lines (since only 2 files)
            all_sent = load_txt_sent(file_names, subset_pct)
            train_split = int(np.ceil(len(all_sent) * (1 - valid_split)))

            train_sent = all_sent[:train_split]
            valid_sent = all_sent[train_split:]
        else:
            neon_logger.display(
                "Unsure how to load file_ext {}, please use 'json' or 'txt'.".
                format(file_ext))
    else:
        train_files = file_names
        if 'json' in file_ext:
            train_sent = load_json_sent(train_files, subset_pct)
        elif 'txt' in file_ext:
            train_sent = load_txt_sent(train_files, subset_pct)
        else:
            neon_logger.display(
                "Unsure how to load file_ext {}, please use 'json' or 'txt'.".
                format(file_ext))
        all_sent = train_sent

    if os.path.exists(vocab_file_name):
        neon_logger.display(
            "open existing vocab file: {}".format(vocab_file_name))
        vocab, rev_vocab, word_count = load_obj(vocab_file_name)
    else:
        neon_logger.display("Building  vocab file")

        # build vocab
        word_count = defaultdict(int)
        for sent in all_sent:
            sent_words = tokenize(sent)

            if len(sent_words) > max_len_w or len(sent_words) == 0:
                continue

            for word in sent_words:
                word_count[word] += 1

        # sort the word_count , re-assign ids by its frequency. Useful for downstream tasks
        # only done for train vocab
        vocab_sorted = sorted(word_count.items(),
                              key=lambda kv: kv[1],
                              reverse=True)

        vocab = OrderedDict()

        # get word count as array in same ordering as vocab (but with maximum length)
        word_count_ = np.zeros((len(word_count), ), dtype=np.int64)
        for i, t in enumerate(list(zip(*vocab_sorted))[0][:max_vocab_size]):
            word_count_[i] = word_count[t]
            vocab[t] = i
        word_count = word_count_

        # generate the reverse vocab
        rev_vocab = dict((wrd_id, wrd) for wrd, wrd_id in vocab.items())

        neon_logger.display("vocabulary from {} is saved into {}".format(
            path, vocab_file_name))
        save_obj((vocab, rev_vocab, word_count), vocab_file_name)

    vocab_size = len(vocab)
    neon_logger.display(
        "\nVocab size from the dataset is: {}".format(vocab_size))

    neon_logger.display(
        "\nProcessing and saving training data into {}".format(h5_file_name))

    # now process and save the train/valid data
    h5f = h5py.File(h5_file_name, 'w', libver='latest')
    shape, maxshape = (len(train_sent), ), (None)
    dt = np.dtype([('text', h5py.special_dtype(vlen=str)),
                   ('num_words', np.uint16)])
    report_text_train = h5f.create_dataset('report_train',
                                           shape=shape,
                                           maxshape=maxshape,
                                           dtype=dt,
                                           compression='gzip')
    report_train = h5f.create_dataset('train',
                                      shape=shape,
                                      maxshape=maxshape,
                                      dtype=h5py.special_dtype(vlen=np.int32),
                                      compression='gzip')

    # map text to integers
    wdata = np.zeros((1, ), dtype=dt)
    ntrain = 0
    for sent in train_sent:
        text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

        # enforce maximum sentence length
        if len(text_int) > max_len_w or len(text_int) == 0:
            continue

        report_train[ntrain] = text_int

        wdata['text'] = clean_string(sent)
        wdata['num_words'] = len(text_int)
        report_text_train[ntrain] = wdata
        ntrain += 1

    report_train.attrs['nsample'] = ntrain
    report_train.attrs['vocab_size'] = vocab_size
    report_text_train.attrs['nsample'] = ntrain
    report_text_train.attrs['vocab_size'] = vocab_size

    if valid_split:
        neon_logger.display(
            "\nProcessing and saving validation data into {}".format(
                h5_file_name))
        shape = (len(valid_sent), )
        report_text_valid = h5f.create_dataset('report_valid',
                                               shape=shape,
                                               maxshape=maxshape,
                                               dtype=dt,
                                               compression='gzip')
        report_valid = h5f.create_dataset(
            'valid',
            shape=shape,
            maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32),
            compression='gzip')
        nvalid = 0
        for sent in valid_sent:
            text_int = [
                -1 if t not in vocab else vocab[t] for t in tokenize(sent)
            ]

            # enforce maximum sentence length
            if len(text_int) > max_len_w or len(text_int) == 0:
                continue

            report_valid[nvalid] = text_int
            wdata['text'] = clean_string(sent)
            wdata['num_words'] = len(text_int)
            report_text_valid[nvalid] = wdata
            nvalid += 1

        report_valid.attrs['nsample'] = nvalid
        report_valid.attrs['vocab_size'] = vocab_size
        report_text_valid.attrs['nsample'] = nvalid
        report_text_valid.attrs['vocab_size'] = vocab_size

    h5f.close()

    return h5_file_name, vocab_file_name
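A hedged usage sketch; paths, split and limits are illustrative only:

h5_file, vocab_file = load_data('/data/corpus_txt',
                                file_ext=['txt'],
                                valid_split=0.1,
                                max_vocab_size=20000,
                                max_len_w=50,
                                output_path='/data/corpus_prep',
                                subset_pct=100)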
Пример #47
0
 def WriteModelVariables(self):
     scalarVariables = self.modelVariable
     # Get maximum length of string vectors
     #maxLenName = self._getMaxLength(scalarVariables.keys())
     #maxLenDescription = self._getMaxLength([x.description for x in scalarVariables.values()])
     # Create dtype object
     numpyDataType = numpy.dtype({
         'names': [
             'name', 'simpleTypeRow', 'causality', 'variability',
             'description', 'objectId', 'column', 'negated'
         ],
         'formats': [
             h5py.special_dtype(
                 vlen=unicode),  #'S' + str(max(maxLenName, 1)),
             'uint32',
             h5py.special_dtype(enum=(numpy.uint8,
                                      CausalityType)),  # 'uint8',
             h5py.special_dtype(enum=(numpy.uint8,
                                      VariabilityType)),  # 'uint8',
             h5py.special_dtype(
                 vlen=unicode),  #'S' + str(max(maxLenDescription, 1)),
             h5py.special_dtype(ref=h5py.Reference),
             'uint32',
             h5py.special_dtype(enum=(numpy.uint8, {
                 'false': 0,
                 'true': 1
             }))
         ]
     })  # 'uint8']})
     self.description = self.file.create_group("ModelDescription")
     # Write information on Simulation group
     description = self.modelDescription
     self.description.attrs['modelName'] = description.modelName
     self.description.attrs['description'] = description.description
     self.description.attrs['author'] = description.author
     self.description.attrs['version'] = description.version
     self.description.attrs['generationTool'] = description.generationTool
     self.description.attrs[
         'generationDateAndTime'] = description.generationDateAndTime
     self.description.attrs[
         'variableNamingConvention'] = description.variableNamingConvention
     dataset = self.description.create_dataset(
         'Variables', (len(scalarVariables), 1),
         dtype=numpyDataType,
         maxshape=(len(scalarVariables), 1),
         compression='gzip')
     # Sort Variables by names
     nameList = [x for x in scalarVariables.keys()]
     nameList.sort()
     allData = []
     i = -1
     for variableName in nameList:
         variable = scalarVariables[variableName]
         i += 1
         variable.rowIndex = i
         x = variableName
         allData.append((x, variable.simpleTypeRow, variable.causality,
                         variable.variability, variable.description,
                         variable.category.dataset.ref,
                         variable.columnIndex, variable.aliasNegated))
     dataset[:, 0] = allData
     return
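A minimal sketch of the enum special_dtype used above. CausalityType is not shown, so a stand-in mapping is used; h5py.check_dtype recovers the name-to-value mapping on read:

import numpy
import h5py

causality = {'parameter': 0, 'input': 1, 'output': 2, 'internal': 3}  # stand-in for CausalityType
enum_dt = h5py.special_dtype(enum=(numpy.uint8, causality))

with h5py.File('enum_demo.h5', 'w') as f:            # hypothetical file name
    ds = f.create_dataset('causality', (4, ), dtype=enum_dt)
    ds[:] = [causality['input'], causality['output'], 0, 3]

with h5py.File('enum_demo.h5', 'r') as f:
    print(h5py.check_dtype(enum=f['causality'].dtype))  # {'parameter': 0, 'input': 1, ...}
    print(f['causality'][:])                            # plain uint8 values: [1 2 0 3]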
Пример #48
0
    def test_create_vlen(self):
        filename = self.getFileName("create_vlen_attribute")
        print("filename:", filename)
        f = h5py.File(filename, 'w')

        is_hsds = False
        if isinstance(f.id.id, str) and f.id.id.startswith("g-"):
            is_hsds = True  # HSDS has different permission defaults
        if not is_hsds:
            # vlen ref types not working for h5serv, so abort here
            f.close()
            return

        g1 = f.create_group('g1')
        g1_1 = g1.create_group('g1_1')
        g1_1.attrs["name"] = 'g1_1'
        g1_2 = g1.create_group('g1_2')
        g1_2.attrs["name"] = 'g1_2'
        g1_3 = g1.create_group('g1_3')
        g1_3.attrs["name"] = 'g1_3'

        # create an attribute that is a VLEN int32
        dtvlen = h5py.special_dtype(vlen=np.dtype('int32'))
        e0 = np.array([0, 1, 2])
        e1 = np.array([0, 1, 2, 3])
        data = np.array([e0, e1], dtype=object)

        g1.attrs.create("a1", data, shape=(2, ), dtype=dtvlen)

        ret_val = g1.attrs["a1"]
        self.assertTrue(isinstance(ret_val, np.ndarray))
        self.assertEqual(len(ret_val), 2)
        self.assertTrue(isinstance(ret_val[0], np.ndarray))
        # py36  attribute[a1]: [array([0, 1, 2], dtype=int32) array([0, 1, 2, 3], dtype=int32)]
        # py27  [(0, 1, 2) (0, 1, 2, 3)]
        self.assertEqual(list(ret_val[0]), [0, 1, 2])
        self.assertEqual(ret_val[0].dtype, np.dtype('int32'))
        self.assertTrue(isinstance(ret_val[1], np.ndarray))
        self.assertEqual(ret_val[1].dtype, np.dtype('int32'))

        self.assertEqual(list(ret_val[1]), [0, 1, 2, 3])

        # create an attribute that is VLEN ObjRef
        dtref = h5py.special_dtype(ref=h5py.Reference)
        dtvlen = h5py.special_dtype(vlen=dtref)
        e0 = np.array((g1_1.ref, ), dtype=dtref)
        e1 = np.array((g1_1.ref, g1_2.ref), dtype=dtref)
        e2 = np.array((g1_1.ref, g1_2.ref, g1_3.ref), dtype=dtref)
        data = [e0, e1, e2]

        g1.attrs.create("b1", data, shape=(3, ), dtype=dtvlen)

        vlen_val = g1.attrs["b1"]  # read back attribute
        self.assertTrue(isinstance(vlen_val, np.ndarray))
        self.assertEqual(len(vlen_val), 3)
        for i in range(3):
            e = vlen_val[i]
            self.assertTrue(isinstance(e, np.ndarray))
            ref_type = h5py.check_dtype(ref=e.dtype)
            self.assertEqual(ref_type, h5py.Reference)
            self.assertEqual(e.shape, ((i + 1), ))
            # first element is always a ref to g1
            refd_group = f[e[0]]
            self.assertEqual(refd_group.attrs['name'], 'g1_1')

        # create an attribute with compound type of vlen objref and int32
        dtcompound = np.dtype([('refs', dtvlen), ('number', 'int32')])
        # create np array with data for the attribute
        # note: two step process is needed, see: https://github.com/h5py/h5py/issues/573
        data = np.zeros((2, ), dtype=dtcompound)
        data[0] = (e1, 1)
        data[1] = (e2, 2)

        g1.attrs.create("c1", data, shape=(2, ), dtype=dtcompound)
        compound_val = g1.attrs["c1"]
        self.assertTrue(isinstance(compound_val, np.ndarray))
        self.assertEqual(len(compound_val), 2)
        self.assertEqual(len(compound_val.dtype), 2)
        for i in range(2):
            item = compound_val[i]
            self.assertTrue(isinstance(item, np.void))
            self.assertEqual(len(item), 2)
            e = item[0]
            self.assertEqual(len(e), i + 2)
            refd_group = f[e[0]]
            self.assertEqual(refd_group.attrs['name'], 'g1_1')
            self.assertEqual(item[1], i + 1)

        # close file
        f.close()
Пример #49
0
  def addAnno ( self, annotype, annoid, kv=None ):
    """Add an annotation to the file."""

    # Create the top level annotation id namespace
    idgrp = self.h5fh.create_group ( str(annoid) )

    # Annotation type
    idgrp.create_dataset ( "ANNOTATION_TYPE", (1,), np.uint32, data=annotype )

    # Create a metadata group
    mdgrp = idgrp.create_group ( "METADATA" )

    # now lets add a bunch of random values for the specific annotation type
    ann_status = random.randint(0,4)
    ann_confidence = random.random()
    ann_author = 'randal'

    # Set Annotation specific metadata
    mdgrp.create_dataset ( "STATUS", (1,), np.uint32, data=ann_status )
    mdgrp.create_dataset ( "CONFIDENCE", (1,), np.float, data=ann_confidence )
    mdgrp.create_dataset ( "AUTHOR", (1,), dtype=h5py.special_dtype(vlen=str), data=ann_author )

    kvpairs={}
    if kv!= None:
      [ k, sym, v ] = kv.partition(':')
      kvpairs[k]=v

      # Turn our dictionary into a csv file
      fstring = cStringIO.StringIO()
      csvw = csv.writer(fstring, delimiter=',')
      csvw.writerows([r for r in kvpairs.iteritems()])

      # User-defined metadata
      mdgrp.create_dataset ( "KVPAIRS", (1,), dtype=h5py.special_dtype(vlen=str), data=fstring.getvalue())

    # Synapse:
    if annotype == 2:

      syn_weight = random.random()*1000.0
      syn_synapse_type = random.randint(1,9)
      syn_seeds = [ random.randint(1,1000) for x in range(5) ]
      syn_centroid = [ random.randint(1,10000) for x in range(3) ]

      syn_segments = [ random.randint(1,1000) for x in range(4) ]
      syn_presegments = [ random.randint(1,1000) for x in range(3) ]
      syn_postsegments = [ random.randint(1,1000) for x in range(2) ]

      mdgrp.create_dataset ( "WEIGHT", (1,), np.float, data=syn_weight )
      mdgrp.create_dataset ( "SYNAPSE_TYPE", (1,), np.uint32, data=syn_synapse_type )
      mdgrp.create_dataset ( "CENTROID", (3,), np.uint32, data=syn_centroid )
      mdgrp.create_dataset ( "SEEDS", (len(syn_seeds),), np.uint32, data=syn_seeds )
      mdgrp.create_dataset ( "SEGMENTS", (len(syn_segments),), np.uint32, data=syn_segments)
      mdgrp.create_dataset ( "PRESEGMENTS", (len(syn_presegments),), np.uint32, data=syn_presegments)
      mdgrp.create_dataset ( "POSTSEGMENTS", (len(syn_postsegments),), np.uint32, data=syn_postsegments)

    # Seed
    elif annotype == 3:

      seed_parent = random.randint(1,1000)
      seed_position = [ random.randint(1,10000) for x in range(3) ]
      seed_cubelocation = random.randint(1,9)
      seed_source = random.randint(1,1000)

      mdgrp.create_dataset ( "PARENT", (1,), np.uint32, data=seed_parent )
      mdgrp.create_dataset ( "CUBE_LOCATION", (1,), np.uint32, data=seed_cubelocation )
      mdgrp.create_dataset ( "SOURCE", (1,), np.uint32, data=seed_source )    
      mdgrp.create_dataset ( "POSITION", (3,), np.uint32, data=seed_position )

    # Segment
    elif annotype == 4:
       
      seg_parentseed = random.randint(1,100000)
      seg_segmentclass = random.randint(1,9)
      seg_neuron = random.randint(1,100000)
      seg_synapses = [ random.randint(1,100000) for x in range(5) ]
      seg_organelles = [ random.randint(1,100000) for x in range(5) ]

      mdgrp.create_dataset ( "SEGMENTCLASS", (1,), np.uint32, data=seg_segmentclass )
      mdgrp.create_dataset ( "PARENTSEED", (1,), np.uint32, data=seg_parentseed )
      mdgrp.create_dataset ( "NEURON", (1,), np.uint32, data=seg_neuron )
      mdgrp.create_dataset ( "SYNAPSES", (len(seg_synapses),), np.uint32, seg_synapses )
      mdgrp.create_dataset ( "ORGANELLES", (len(seg_organelles),), np.uint32, seg_organelles )

    # Neuron
    elif annotype == 5:

      neuron_segments = [ random.randint(1,1000) for x in range(10) ]
      mdgrp.create_dataset ( "SEGMENTS", (len(neuron_segments),), np.uint32, neuron_segments )

    # Organelle
    elif annotype == 6:

      org_parentseed = random.randint(1,100000)
      org_organelleclass = random.randint(1,9)
      org_seeds = [ random.randint(1,100000) for x in range(5) ]
      org_centroid = [ random.randint(1,10000) for x in range(3) ]

      mdgrp.create_dataset ( "ORGANELLECLASS", (1,), np.uint32, data=org_organelleclass )
      mdgrp.create_dataset ( "PARENTSEED", (1,), np.uint32, data=org_parentseed )
      mdgrp.create_dataset ( "SEEDS", (len(org_seeds),), np.uint32, org_seeds )
      mdgrp.create_dataset ( "CENTROID", (3,), np.uint32, data=org_centroid )

    # Node
    elif annotype == 7:

      node_nodetype = random.randint(1,9)
      node_skeletonid = random.randint(1,9)
      node_parentid = random.randint(1,100000)
      node_location = [ random.randint(1,10000) for x in range(3) ]
      node_children = [ random.randint(1,10000) for x in range(5) ]
      node_diameter = random.random()

      mdgrp.create_dataset ( "NODETYPE", (1,), np.uint32, data=node_nodetype )
      mdgrp.create_dataset ( "SKELETONID", (1,), np.uint32, data=node_skeletonid )
      mdgrp.create_dataset ( "PARENTID", (1,), np.uint32, data=node_parentid )
      mdgrp.create_dataset ( "LOCATION", (3,), np.uint32, data=node_location )
      mdgrp.create_dataset ( "CHILDREN", (5,), np.uint32, data=node_children )
      mdgrp.create_dataset ( "DIAMETER", (1,), np.float, data=node_diameter )

    # Skeleton
    elif annotype == 8:

      skel_skeltype = random.randint(1,9)
      skel_rootnode = random.randint(1,100000)

      mdgrp.create_dataset ( "SKELETONTYPE", (1,), np.uint32, data=skel_skeltype )
      mdgrp.create_dataset ( "ROOTNODE", (1,), np.uint32, data=skel_rootnode )
Пример #50
0
    def write_dict_to_hdf5(self, data_dict, entry_point):
        """ Write a (nested) dictionary to HDF5 

        Args:
            data_dict (dict): Dictionary to be written
            entry_point (object): Object to write to
        """
        for key, item in data_dict.items():
            if isinstance(key, (float, int)):
                key = '__' + str(type(key)) + '__' + str(key)

            if isinstance(item, (str, bool, float, int)):
                entry_point.attrs[key] = item
            elif isinstance(item, np.ndarray):
                entry_point.create_dataset(key, data=item)
            elif isinstance(item, (np.int32, np.int64)):
                entry_point.attrs[key] = int(item)
            elif item is None:
                # as h5py does not support saving None as attribute
                # I create special string, note that this can create
                # unexpected behaviour if someone saves a string with this name
                entry_point.attrs[key] = 'NoneType:__None__'
            elif isinstance(item, dict):
                entry_point.create_group(key)
                self.write_dict_to_hdf5(data_dict=item,
                                        entry_point=entry_point[key])
            elif isinstance(item, tuple):
                self._write_list_group(key, item, entry_point, 'tuple')
            elif isinstance(item, list):
                if len(item) > 0:
                    elt_type = type(item[0])
                    if all(isinstance(x, elt_type) for x in item):
                        if isinstance(item[0],
                                      (int, float, np.int32, np.int64)):

                            entry_point.create_dataset(key,
                                                       data=np.array(item))
                            entry_point[key].attrs['list_type'] = 'array'
                        elif isinstance(item[0], str):
                            dt = h5py.special_dtype(vlen=str)
                            data = np.array(item)
                            data = data.reshape((-1, 1))
                            ds = entry_point.create_dataset(key,
                                                            (len(data), 1),
                                                            dtype=dt)
                            ds[:] = data
                        elif isinstance(item[0], dict):
                            entry_point.create_group(key)
                            group_attrs = entry_point[key].attrs
                            group_attrs['list_type'] = 'dict'
                            base_list_key = 'list_idx_{}'
                            group_attrs['base_list_key'] = base_list_key
                            group_attrs['list_length'] = len(item)
                            for i, list_item in enumerate(item):
                                list_item_grp = entry_point[key].create_group(
                                    base_list_key.format(i))
                                self.write_dict_to_hdf5(
                                    data_dict=list_item,
                                    entry_point=list_item_grp)
                        else:
                            logging.warning(
                                'List of type "{}" for "{}":"{}" not '
                                'supported, storing as string'.format(
                                    elt_type, key, item))
                            entry_point.attrs[key] = str(item)
                    else:
                        self._write_list_group(key, item, entry_point, 'list')
                else:
                    # as h5py does not support saving None as attribute
                    entry_point.attrs[key] = 'NoneType:__emptylist__'

            else:
                logging.warning('Type "{}" for "{}":"{}" not supported, '
                                'storing as string'.format(
                                    type(item), key, item))
                entry_point.attrs[key] = str(item)
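A hedged usage sketch; the surrounding class is not shown, so saver stands for any instance exposing write_dict_to_hdf5:

import h5py
import numpy as np

snapshot = {
    'sample_rate': 1e6,                 # scalar -> attribute
    'waveform': np.linspace(0, 1, 5),   # ndarray -> dataset
    'notes': None,                      # stored as the sentinel 'NoneType:__None__'
    'channels': ['ch0', 'ch1'],         # list of str -> (N, 1) vlen-string dataset
    'meta': {'operator': 'alice'},      # nested dict -> subgroup
}

with h5py.File('snapshot.h5', 'w') as f:             # hypothetical file name
    saver.write_dict_to_hdf5(data_dict=snapshot, entry_point=f)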
Пример #51
0
 def storeClassLables(self, classLabels):
     # Create a dataset to store the actual class label names, then store the class labels
     dt = h5py.special_dtype(vlen=unicode)
     labelSet = self.db.create_dataset("label_names", (len(classLabels), ),
                                       dtype=dt)
     labelSet[:] = classLabels
Пример #52
0
from .mockdata import write_file
from .mockdata.xgm import XGM
from .mockdata.gec_camera import GECCamera
from .mockdata.basler_camera import BaslerCamera as BaslerCam
from .mockdata.adc import ADC
from .mockdata.uvlamp import UVLamp
from .mockdata.motor import Motor
from .mockdata.tsens import TemperatureSensor
from .mockdata.imgfel import IMGFELCamera, IMGFELMotor
from .mockdata.gauge import Gauge
from .mockdata.dctrl import DCtrl
from .mockdata.mpod import MPOD
from .mockdata.detectors import AGIPDModule, LPDModule

vlen_bytes = h5py.special_dtype(vlen=bytes)


def make_metadata(h5file, data_sources, chunksize=16):
    N = len(data_sources)
    if N % chunksize:
        N += chunksize - (N % chunksize)

    root = [ds.split('/', 1)[0] for ds in data_sources]
    devices = [ds.split('/', 1)[1] for ds in data_sources]

    sources_ds = h5file.create_dataset('METADATA/dataSourceId', (N, ),
                                       dtype=vlen_bytes,
                                       maxshape=(None, ))
    sources_ds[:len(data_sources)] = data_sources
    root_ds = h5file.create_dataset('METADATA/root', (N, ),
                                    dtype=vlen_bytes,
                                    maxshape=(None, ))
Пример #53
0
def extract_flow_to_hdf5(model, example_list, cleanup_tmp_dirs=True):

    for example_idx, example in enumerate(example_list):

        # Parse dataset number from string 'video_000010' ==> int(10)
        dset_index = int(re.findall(r'\d+', example['dset_name'])[-1])

        print('Working on example {}/{}...'.format(example_idx + 1,
                                                   len(example_list)))
        tmp_image_dir = example['tmp_image_dir']
        num_image_files = len(cortex.utils.find_files(tmp_image_dir, 'jpg'))
        if num_image_files == 0:
            print('Skipping directory because no image files found: {}'.format(
                tmp_image_dir))
            continue

        # Intialize the data loader for this video
        inference_size = [-1, -1]  # largest possible
        dataset = ImagesFromFolderInference(tmp_image_dir,
                                            inference_size,
                                            extension='jpg')
        data_loader = DataLoader(dataset,
                                 batch_size=args.batch_size,
                                 num_workers=2,
                                 shuffle=False,
                                 pin_memory=True)
        print('Successfully initialized dataloader with {} image pairs.'.format(
            len(dataset)))

        ######################################################################################
        ######################################################################################

        flow_minmax = []
        flow_images = []

        num_batches = int(np.ceil(len(dataset) / args.batch_size))

        for batch_idx, (data, target) in enumerate(data_loader):

            # Prepare inputs for forward pass
            if args.cuda:
                data, target = [d.cuda(non_blocking=True) for d in data
                                ], [t.cuda(non_blocking=True) for t in target]
            data, target = [Variable(d)
                            for d in data], [Variable(t) for t in target]

            # Actual forward pass through the network
            with torch.no_grad():
                # Shape = [N,2,H,W]
                output = model(data[0])

            # Saving the outputs
            # use a distinct name so the outer loop's example_idx is not shadowed
            for sample_idx in range(output.shape[0]):
                flow_single = output[sample_idx].data.cpu().numpy().transpose(
                    1, 2, 0)
                # Normalize and get 3-channel image
                flow_u_norm, min_u, max_u = cortex.vision.flow.normalize_flow(
                    flow_single[:, :, 0])
                flow_v_norm, min_v, max_v = cortex.vision.flow.normalize_flow(
                    flow_single[:, :, 1])
                flow_as_jpg = np.dstack(
                    (flow_u_norm, flow_v_norm, np.zeros_like(flow_u_norm)))
                flow_as_jpg = (flow_as_jpg * 255.0).astype(np.uint8)
                # Save results
                flow_minmax.append((min_u, max_u, min_v, max_v))
                flow_images.append(flow_as_jpg)

        # All batches are done, now write flow frames to HDF5 file
        with h5py.File(example['hdf5_file'], 'a') as hf:
            print('Writing results to HDF5 file...')
            # Create dataset for frames
            dt_frames = h5py.special_dtype(vlen=np.dtype('uint8'))
            dset_flow = hf.create_dataset("flow_{:06d}".format(dset_index),
                                          shape=(len(flow_images), ),
                                          dtype=dt_frames)
            for frame_idx, flow_image in enumerate(flow_images):
                # Apply JPG compression to the raw video frame
                encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 100]
                frame_jpg_encode = cv2.imencode(".jpg", flow_image,
                                                encode_params)
                frame_jpg_encode = frame_jpg_encode[1].tobytes()
                dset_flow[frame_idx] = np.frombuffer(frame_jpg_encode,
                                                     dtype='uint8')
            flow_minmax = np.asarray(flow_minmax, np.float64)
            dset_minmax = hf.create_dataset(
                "flow_minmax_{:06d}".format(dset_index), data=flow_minmax)

        if cleanup_tmp_dirs:
            print('cleaning up temporary image directory: {}'.format(
                tmp_image_dir))
            shutil.rmtree(tmp_image_dir)

        print('#' * 60)
    print('Done.')
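
For completeness, a hedged sketch of reading such a file back; the file name is a placeholder and the dataset index matches the 'video_000010' illustration above. cv2.imdecode reverses the imencode step, and the minmax values undo the normalisation (assuming normalize_flow maps each channel to the [0, 1] range):

import cv2
import numpy as np
import h5py

with h5py.File('example.h5', 'r') as hf:              # assumed to be a file written above
    frames = hf['flow_000010'][:]                      # one uint8 JPEG buffer per frame
    minmax = hf['flow_minmax_000010'][:]               # per-frame (min_u, max_u, min_v, max_v)
    img = cv2.imdecode(frames[0], cv2.IMREAD_COLOR)    # decode frame 0 back to an HxWx3 array
    min_u, max_u = minmax[0, 0], minmax[0, 1]
    # undo the uint8 / [0, 1] normalisation of the u channel
    flow_u = img[:, :, 0].astype(np.float32) / 255.0 * (max_u - min_u) + min_u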
Example #54
def write(filename,
          points,
          cells,
          point_data=None,
          cell_data=None,
          field_data=None,
          add_global_ids=True):
    '''Writes H5M files, cf.
    https://trac.mcs.anl.gov/projects/ITAPS/wiki/MOAB/h5m.
    '''
    import h5py

    point_data = {} if point_data is None else point_data
    cell_data = {} if cell_data is None else cell_data
    field_data = {} if field_data is None else field_data

    f = h5py.File(filename, 'w')

    tstt = f.create_group('tstt')

    # The base index for h5m is 1.
    global_id = 1

    # add nodes
    nodes = tstt.create_group('nodes')
    coords = nodes.create_dataset('coordinates', data=points)
    coords.attrs.create('start_id', global_id)
    global_id += len(points)

    # Global tags
    tstt_tags = tstt.create_group('tags')

    # The GLOBAL_ID associated with a point is used to identify points if
    # distributed across several processes. mbpart automatically adds them,
    # too.
    if 'GLOBAL_ID' not in point_data and add_global_ids:
        point_data['GLOBAL_ID'] = numpy.arange(
            1,
            len(points) + 1,
        )

    # add point data
    if point_data is not None:
        tags = nodes.create_group('tags')
        for key, data in point_data.items():
            if len(data.shape) == 1:
                dtype = data.dtype
                tags.create_dataset(key, data=data)
            else:
                # H5M doesn't accept n-x-k arrays as data; it wants an n-x-1
                # array with k-tuples as entries.
                n, k = data.shape
                dtype = numpy.dtype((data.dtype, (k, )))
                dset = tags.create_dataset(key, (n, ), dtype=dtype)
                dset[:] = data

            # Create entry in global tags
            g = tstt_tags.create_group(key)
            g['type'] = dtype
            # Add a class tag:
            # From
            # <http://lists.mcs.anl.gov/pipermail/moab-dev/2015/007104.html>:
            # ```
            # /* Was dense tag data in mesh database */
            # #define mhdf_DENSE_TYPE   2
            # /** \brief Was sparse tag data in mesh database */
            # #define mhdf_SPARSE_TYPE  1
            # /** \brief Was bit-field tag data in mesh database */
            # #define mhdf_BIT_TYPE     0
            # /** \brief Unused */
            # #define mhdf_MESH_TYPE    3
            # ```
            g.attrs['class'] = 2

    # add elements
    elements = tstt.create_group('elements')

    elem_dt = h5py.special_dtype(enum=('i', {
        'Edge': 1,
        'Tri': 2,
        'Quad': 3,
        'Polygon': 4,
        'Tet': 5,
        'Pyramid': 6,
        'Prism': 7,
        'Knife': 8,
        'Hex': 9,
        'Polyhedron': 10
    }))

    tstt['elemtypes'] = elem_dt

    tstt.create_dataset('history',
                        data=[
                            __name__.encode('utf-8'),
                            __about__.__version__.encode('utf-8'),
                            str(datetime.now()).encode('utf-8')
                        ])

    # number of nodes to h5m name, element type
    meshio_to_h5m_type = {
        'line': {
            'name': 'Edge2',
            'type': 1
        },
        'triangle': {
            'name': 'Tri3',
            'type': 2
        },
        'tetra': {
            'name': 'Tet4',
            'type': 5
        }
    }
    for key, data in cells.items():
        if key not in meshio_to_h5m_type:
            logging.warning('Unsupported H5M element type \'%s\'. Skipping.',
                            key)
            continue
        this_type = meshio_to_h5m_type[key]
        elem_group = elements.create_group(this_type['name'])
        elem_group.attrs.create('element_type',
                                this_type['type'],
                                dtype=elem_dt)
        # h5m node indices are 1-based
        conn = elem_group.create_dataset('connectivity', data=(data + 1))
        conn.attrs.create('start_id', global_id)
        global_id += len(data)

    # add cell data
    if cell_data:
        tags = elem_group.create_group('tags')
        for key, value in cell_data.items():
            tags.create_dataset(key, data=value)

    # add empty set -- MOAB wants this
    sets = tstt.create_group('sets')
    sets.create_group('tags')

    # set max_id
    tstt.attrs.create('max_id', global_id, dtype='u8')

    return
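
Since the element types are stored with an enum special dtype committed at tstt/elemtypes, the name-to-code mapping can be recovered when reading the file back. A short sketch, assuming a file written by the function above (the file name is hypothetical):

import h5py

with h5py.File('mesh.h5m', 'r') as f:
    elem_dt = f['tstt/elemtypes'].dtype               # the committed enum datatype
    print(h5py.check_dtype(enum=elem_dt))             # e.g. {'Edge': 1, 'Tri': 2, ...}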
Example #55
# rd_cad = comm.bcast(rd_cad, root=0)

# carma task arguments
nwalkers = 200
nsteps = 1000

bands = ['r']  # will loop over for each band in list
shape = (nwalkers, nsteps)
dtype = np.dtype([('LnPosterior', np.float64, shape),
                  ('Chain[0]', np.float64, shape),
                  ('Chain[1]', np.float64, shape),
                  ('rootChain[0]', np.complex128, shape),
                  ('rootChain[1]', np.complex128, shape)])

# hdf5 reference special data type
ref_dtype = h5py.special_dtype(ref=h5py.Reference)
dt = h5py.special_dtype(vlen=str)


def lsst_fit(lc, grp):
    """Take full mock LC and SDSS cadence to find best_fit params.

    Args:
        lc: Kali LC object, full mock LC.
        grp: HDF5 group storing the MCMC chains.
    """

    best_param = []  # store best-fit params
    ref_ls = []
    task = kali.carma.CARMATask(1,
                                0,
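
The ref special dtype defined above holds HDF5 object references. A self-contained sketch of writing and dereferencing such references (the file and dataset names are invented for illustration):

import numpy as np
import h5py

ref_dtype = h5py.special_dtype(ref=h5py.Reference)

with h5py.File('refs_demo.h5', 'w') as f:
    a = f.create_dataset('chain_r', data=np.arange(5))
    b = f.create_dataset('chain_g', data=np.arange(5) * 2)
    refs = f.create_dataset('best_fit_refs', (2,), dtype=ref_dtype)
    refs[0] = a.ref                   # store references to other datasets
    refs[1] = b.ref
    print(f[refs[1]][:])              # dereference through the file -> [0 2 4 6 8]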
Example #56
    def step(self, action):
        '''
        Convention says environment outputs np.arrays
        :param action: LongTensor(batch_size) or np.array(batch_size), the last discrete action chosen
        :return:
        '''
        try:  # in case action is a torch.Tensor
            action = action.cpu().numpy()
        except AttributeError:
            pass

        self.actions.append(action[:, None])

        next_state = action
        if len(self.actions) < self._max_episode_steps:
            done = self.codec.is_padding(
                action)  # max index is padding, by convention
        else:
            done = np.ones_like(action) == 1

        reward = np.zeros_like(action, dtype=float)
        # for those sequences just computed, calculate the reward
        for i in range(len(action)):
            if self.done_rewards[i] is None and done[i]:
                this_action_seq = np.concatenate(self.actions,
                                                 axis=1)[i:(i + 1), :]
                this_char_seq = self.codec.actions_to_strings(
                    this_action_seq)  # codec expects a batch
                self.smiles[i] = this_char_seq[0]
                this_mol = Chem.MolFromSmiles(self.smiles[i])
                if this_mol is None:
                    print(self.smiles[i])
                    # rules = self.codec.grammar.GCFG.productions()
                    # for a in this_action_seq[0]:
                    #     print(rules[a])
                    self.valid[i] = 0
                else:
                    self.valid[i] = 1
                this_reward = self.reward_fun(this_char_seq)[0]
                self.done_rewards[i] = this_reward
                reward[i] = this_reward
                self.seq_len[i] = len(self.actions)

        #TODO: put the special string handling into the hdf5 wrapper
        import h5py
        dt = h5py.special_dtype(
            vlen=str)  # PY3 hdf5 datatype for variable-length Unicode strings
        if len(self.actions) == self._max_episode_steps:
            # dump the whole batch to disk
            append_data = {
                'smiles': np.array(self.smiles, dtype=dt),
                'actions': np.concatenate(self.actions, axis=1),
                'seq_len': self.seq_len
            }
            if self.save_dataset is not None:
                self.save_dataset.append(append_data)

        if False and not all(reward == reward):
            print('failure!')

        return next_state, reward, done, (self.smiles, self.valid)
Example #57
    def writeVlen(self, data, key='data'):
        self.makedirs()
        # open in append mode explicitly; recent h5py versions no longer default to it
        with h5py.File(self.path, 'a') as f:
            dt = h5py.special_dtype(vlen=np.dtype(data[0].dtype))
            f.create_dataset(key, data=data, dtype=dt)
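
A short standalone usage sketch of the same pattern, with a made-up path and ragged data, including reading it back:

import numpy as np
import h5py

data = np.array([np.arange(3), np.arange(7), np.arange(2)], dtype=object)  # ragged rows

with h5py.File('vlen_demo.h5', 'w') as f:
    dt = h5py.special_dtype(vlen=np.dtype(data[0].dtype))
    f.create_dataset('data', data=data, dtype=dt)

with h5py.File('vlen_demo.h5', 'r') as f:
    rows = f['data'][:]                          # object array of variable-length rows
    print([row.shape for row in rows])           # -> [(3,), (7,), (2,)]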
Example #58
from ...utils import docval, getargs, popargs, call_docval_func
from ...data_utils import AbstractDataChunkIterator, get_shape
from ...build import Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager,\
                     RegionBuilder, ReferenceBuilder, TypeMap
from ...spec import RefSpec, DtypeSpec, NamespaceCatalog, GroupSpec
from ...spec import NamespaceBuilder

from .h5_utils import H5ReferenceDataset, H5RegionDataset, H5TableDataset,\
                      H5DataIO, H5SpecReader, H5SpecWriter

from ..io import FORMIO

ROOT_NAME = 'root'
SPEC_LOC_ATTR = '.specloc'
H5_TEXT = special_dtype(vlen=text_type)
H5_BINARY = special_dtype(vlen=binary_type)
H5_REF = special_dtype(ref=Reference)
H5_REGREF = special_dtype(ref=RegionReference)


class HDF5IO(FORMIO):
    @docval({
        'name': 'path',
        'type': str,
        'doc': 'the path to the HDF5 file'
    }, {
        'name': 'manager',
        'type': BuildManager,
        'doc': 'the BuildManager to use for I/O',
        'default': None
Example #59
def main(args):
  print('Loading image info from "%s"' % args.images_json)
  with open(args.images_json, 'r') as f:
    images = json.load(f)
  image_id_to_image = {i['image_id']: i for i in images}

  with open(args.splits_json, 'r') as f:
    splits = json.load(f)

  # Filter images for being too small
  splits = remove_small_images(args, image_id_to_image, splits)

  obj_aliases = load_aliases(args.object_aliases)
  rel_aliases = load_aliases(args.relationship_aliases)

  print('Loading objects from "%s"' % args.objects_json)
  with open(args.objects_json, 'r') as f:
    objects = json.load(f)

  # Vocab for objects and relationships
  vocab = {}
  train_ids = splits[args.train_split]
  create_object_vocab(args, train_ids, objects, obj_aliases, vocab)

  print('Loading attributes from "%s"' % args.attributes_json)
  with open(args.attributes_json, 'r') as f:
    attributes = json.load(f)

  # Vocab for attributes
  create_attribute_vocab(args, train_ids, attributes, vocab)

  object_id_to_obj = filter_objects(args, objects, obj_aliases, vocab, splits)
  print('After filtering there are %d object instances'
        % len(object_id_to_obj))

  print('Loading relationships from "%s"' % args.relationships_json)
  with open(args.relationships_json, 'r') as f:
    relationships = json.load(f)

  create_rel_vocab(args, train_ids, relationships, object_id_to_obj,
                   rel_aliases, vocab)

  print('Encoding objects and relationships ...')
  numpy_arrays = encode_graphs(args, splits, objects, relationships, vocab,
                               object_id_to_obj, attributes)

  print('Writing HDF5 output files')
  for split_name, split_arrays in numpy_arrays.items():
    image_ids = list(split_arrays['image_ids'].astype(int))
    h5_path = os.path.join(args.output_h5_dir, '%s.h5' % split_name)
    print('Writing file "%s"' % h5_path)
    with h5py.File(h5_path, 'w') as h5_file:
      for name, ary in split_arrays.items():
        print('Creating dataset: ', name, ary.shape, ary.dtype)
        h5_file.create_dataset(name, data=ary)
      print('Writing image paths')
      image_paths = get_image_paths(image_id_to_image, image_ids)
      path_dtype = h5py.special_dtype(vlen=str)
      path_shape = (len(image_paths),)
      path_dset = h5_file.create_dataset('image_paths', path_shape,
                                         dtype=path_dtype)
      for i, p in enumerate(image_paths):
        path_dset[i] = p
    print()

  print('Writing vocab to "%s"' % args.output_vocab_json)
  with open(args.output_vocab_json, 'w') as f:
    json.dump(vocab, f)
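
Because image_paths is written with a variable-length string dtype, h5py 3.x returns bytes by default when reading it back. A brief sketch of recovering the paths as text (the file name is a placeholder; asstr() requires h5py >= 3.0):

import h5py

with h5py.File('train.h5', 'r') as f:            # assumed to be one of the split files above
    paths = f['image_paths'].asstr()[:]          # decode variable-length strings to Python str
    print(paths[:3])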
Example #60
    def save_attr(group, col, scarf_col, md):
        d = md.fetch_all(scarf_col)
        h5[group].create_dataset(col,
                                 data=d.astype(h5py.special_dtype(vlen=str)))