Example #1
def write_to_disk(data_dict: TimeSeriesDict, seg_start: int, f: File):
    """Write a TimeSeriesDict to a gwpy-compatible .hdf5 file. Supports appending to an existing file."""

    for name in data_dict:

        # deal with each TimeSeries in the TimeSeriesDict.
        data = data_dict[name]

        try:

            # create a gwpy-compatible h5py file.
            data.write(f, **writing_opts)

        except RuntimeError:  # this RuntimeError is raised when the dataset already exists.

            # use the h5py File driver to get a direct pointer to the existing dataset.
            dataset = f[name]

            # compute the time offset between the existing data and the new data.
            secs = seg_start - get_last_time(dataset)
            padding = int(secs / dataset.attrs['dx'])  # gap in samples; Dataset.resize() needs an integer
            # print(f'write: padding from {get_last_time(dataset)} to {seg_start} ({secs}s, {padding}pts)')

            if data.value.shape[0] < -padding:

                # this would resize the dataset to be smaller than it already is.
                raise RuntimeError('insertion is not supported.')

            else:

                # append data to the end of the file.
                dataset.resize(
                    (dataset.shape[0] + padding + data.value.shape[0]), axis=0)
                dataset[-data.value.shape[0]:] = data.value
                f.flush()  # sync table to disk
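The append branch above relies on the dataset being resizable along its first axis. A minimal, self-contained sketch of that resize-and-append pattern (the file and dataset names are illustrative, not taken from the original code):

import numpy as np
import h5py

# create or open a file and grow a 1-D dataset along its unlimited axis
with h5py.File('append_demo.h5', 'a') as f:
    if 'strain' not in f:
        # maxshape=(None,) makes the first axis resizable
        f.create_dataset('strain', shape=(0,), maxshape=(None,), dtype='f8')
    ds = f['strain']
    new = np.random.rand(1024)
    ds.resize(ds.shape[0] + new.shape[0], axis=0)  # extend the dataset
    ds[-new.shape[0]:] = new                       # write the new samples at the end
    f.flush()                                      # sync the table to disk, as above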
Example #2
def write_to_hdf5(path):
    """
    Take FITS file at ``path`` and save it into the HDF5 archive
    """
    kid = path.split('kplr')[1].split('-')[0]

    lc_data = fits.getdata(path)

    if os.path.exists(hdf5_path):
        h = File(hdf5_path, 'a')
    else:
        h = File(hdf5_path, 'w')

    attrs = header_to_dict(path)

    if kid in h:
        group = h[kid]
    else:
        group = h.create_group(kid)

    dataset_name = f"q{attrs['QUARTER']}"
    if dataset_name not in group:
        dset = group.create_dataset(dataset_name,
                                    data=np.vstack(
                                        [lc_data[col] for col in cols]))

        for k, v in attrs.items():
            dset.attrs[k] = v

    h.flush()
    h.close()
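Note that h5py's 'a' mode already opens an existing file for read/write and creates it when it is missing, so the os.path.exists branch above can be collapsed into a single call. A minimal sketch (hdf5_path is assumed to be defined at module level, as in the example):

from h5py import File

h = File(hdf5_path, 'a')  # read/write if the file exists, create it otherwise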
Example #3
def main():
    a = DetPulseCoord()
    fileid = h5f.create(b"test.h5")
    x = [1, 3, 3]
    y = [1., 3., 3, 4., 5, 3., 33.]
    x = ones((100, 3), dtype=int32)
    y = ones((100, 7), dtype=float32)
    z = ones((100, 2), dtype=float32)
    c = [(x[i], y[i], z[i]) for i in range(100)]
    data = {a.names[0]: x, a.names[1]: y}
    dspaceid = h5s.create_simple((1, ), (h5s.UNLIMITED, ))
    # dset = h5d.create(fileid, a.name, a.type, dspaceid)
    # dset.write()
    file = File("test.h5")
    numpytype = dtype([("coord", int32, (3, )), ("pulse", float32, (7, )),
                       ("EZ", float32, (2, ))])
    data = array(c, dtype=numpytype)
    tid = h5t.C_S1.copy()
    tid.set_size(6)
    H5T6 = Datatype(tid)
    tid.set_size(4)
    H5T_C_S1_4 = Datatype(tid)
    file.create_dataset("DetPulseCoord", data=data)
    file.attrs.create("CLASS", "TABLE", dtype=H5T6)
    file.attrs.create("FIELD_0_NAME", a.names[0])
    file.attrs.create("FIELD_1_NAME", a.names[1])
    file.attrs.create("TITLE", "Detpulse coord pair data")

    file.attrs.create("VERSION", "3.0", dtype=H5T_C_S1_4)
    file.attrs.create("abstime", 1.45e9, dtype=float64, shape=(1, ))
    file.attrs.create("nevents", 122421, dtype=float64, shape=(1, ))
    file.attrs.create("runtime", 125000, dtype=float64, shape=(1, ))
    file.flush()
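For comparison, a shorter sketch of writing a structured (table-like) dataset and flushing it. It attaches the PyTables-style markers to the dataset rather than to the file, and all names, dtypes and shapes are illustrative rather than taken from the example:

import numpy as np
import h5py

rec_dtype = np.dtype([("coord", np.int32, (3,)), ("pulse", np.float32, (7,))])
records = np.zeros(100, dtype=rec_dtype)  # illustrative placeholder rows

with h5py.File("table_demo.h5", "w") as f:
    dset = f.create_dataset("DetPulseCoord", data=records)
    dset.attrs["CLASS"] = np.bytes_("TABLE")  # PyTables-style table marker
    dset.attrs["TITLE"] = np.bytes_("Detpulse coord pair data")
    f.flush()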
Example #4
def train_one_case_generic_save_data(train_result: dict, key_this: str,
                                     f_out: h5py.File, y_test: np.ndarray,
                                     eval_fn):
    assert {'y_test_hat', 'corr'} <= train_result.keys() <= {
        'y_test_hat', 'corr', 'attrs', 'model'
    }
    # save
    y_test_hat = train_result['y_test_hat']
    assert np.all(np.isfinite(y_test_hat))
    assert y_test_hat.ndim == 2 and y_test_hat.shape[1] == 1
    assert y_test.shape == y_test_hat.shape
    grp_this = f_out.create_group(key_this)
    grp_this.create_dataset('y_test_hat', data=y_test_hat)
    assert np.isscalar(train_result['corr']) and np.isfinite(
        train_result['corr'])

    assert eval_fn(y_test_hat, y_test) == train_result['corr']

    grp_this.create_dataset('corr', data=train_result['corr'])
    print('performance', train_result['corr'])
    if 'attrs' in train_result:
        # save attrs
        for k, v in train_result['attrs'].items():
            grp_this.attrs[k] = v
    if 'model' in train_result:
        grp_this_model = grp_this.create_group('model')
        if isinstance(train_result['model'], dict):
            for k_model, v_model in train_result['model'].items():
                grp_this_model.create_dataset(k_model, data=v_model)
        else:
            # for Gabor.
            assert callable(train_result['model'])
            train_result['model'](grp_this_model)

    f_out.flush()
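A minimal calling sketch for the function above, using a toy train_result and a correlation-based eval_fn (all names, shapes and the output path are illustrative):

import numpy as np
import h5py

y_test = np.random.rand(50, 1)
y_test_hat = y_test + 0.01 * np.random.rand(50, 1)
eval_fn = lambda a, b: float(np.corrcoef(a.ravel(), b.ravel())[0, 1])
train_result = {'y_test_hat': y_test_hat, 'corr': eval_fn(y_test_hat, y_test)}

with h5py.File('fits.h5', 'w') as f_out:  # illustrative output file
    train_one_case_generic_save_data(train_result, 'case0', f_out, y_test, eval_fn)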
Example #5
 def analyzeExperiment(self, experimentResults):
     # write to data file
     super(Origin, self).toHDF5(experimentResults[self.settings])
     # and to settings file
     try:
         f = File('settings.hdf5', 'a')
         super(Origin, self).toHDF5(f['settings/experiment'])
         f.flush()  # write changes
     except Exception as e:
         logger.exception('Uncaught Exception in origin.postExperiment.')
     finally:
         f.close()  # close the file
     return 0
Example #6
def handle_one_case_inner(neural_dataset_key, subset, has_val, train_percentage,
                          seed, f_in_idx: h5py.File, features_all: np.ndarray,
                          f_out: h5py.File, transformer: CNNPreprocessor):
    dataset_main_name = split_dataset_name_gen(neural_dataset_key, subset, has_val, train_percentage, seed)

    print(f'handle {dataset_main_name}')

    sets_to_handle = ('train', 'val', 'test') if has_val else ('train', 'test')

    for set_to_handle_this in sets_to_handle:
        data_to_save = dataset_main_name + f'/{set_to_handle_this}/X'
        if data_to_save not in f_out:
            set_original_idx = f_in_idx[dataset_main_name][set_to_handle_this].attrs['index']
            assert np.array_equal(np.unique(set_original_idx), set_original_idx)
            set_original = features_all[set_original_idx]
            set_transformed = transformer.transform(set_original)
            # then save
            f_out.create_dataset(data_to_save, data=set_transformed)
            f_out.flush()
            print(f'{set_to_handle_this} done')
        else:
            print(f'{set_to_handle_this} done before')
Example #7
def handle_one_case_inner(neural_dataset_key, subset, has_val,
                          train_percentage, seed, f_in: h5py.File,
                          f_out: h5py.File, transformer: GLMDataPreprocesser):
    dataset_main_name = split_dataset_name_gen(neural_dataset_key, subset,
                                               has_val, train_percentage, seed)

    print(f'handle {dataset_main_name}')

    sets_to_handle = ('train', 'val', 'test') if has_val else ('train', 'test')

    for set_to_handle_this in sets_to_handle:
        data_to_save = dataset_main_name + f'/{set_to_handle_this}/X'
        if data_to_save not in f_out:
            set_original = f_in[dataset_main_name][f'{set_to_handle_this}/X'][
                ...]
            set_transformed = transformer.transform(set_original)
            # then save
            f_out.create_dataset(data_to_save, data=set_transformed)
            f_out.flush()
            print(f'{set_to_handle_this} done')
        else:
            print(f'{set_to_handle_this} done before')
Example #8
def init_file(f: h5py.File, groupname: str = 'data') -> None:

    # drop any existing group of that name, then (re)create it
    if groupname in f:
        del f[groupname]
        f.flush()

    grp = f.create_group(groupname)
    add_cur_time_attr(grp)
    f.flush()
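A minimal usage sketch for init_file, assuming the file was opened writable beforehand (the path is illustrative):

import h5py

with h5py.File('run.ddh5', 'a', libver='latest') as f:  # illustrative path
    init_file(f, groupname='data')  # (re)create the top-level data group
    # ... subsequent writes then target f['data'] ...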
Example #9
class Series(object):
    """
    Time series of fMRI images; stores all pixel data in an HDF5 file.
    """
    def __new__(cls, hdf5_path, *args, **kwargs):
        """
        If the hdf5 file exists, it will be loaded;
        otherwise a new hdf5 file is created, in which case
        these arguments are needed:
            :image_dir: path to the directory which stores the hdr image files.
            NOTE: Image names must sort in the same order as the time sequence.
            :time_interval: time interval between two images, unit: 1 second
        """
        if not exists(hdf5_path):
            image_dir = kwargs['image_dir']
            time_interval = kwargs['time_interval']
            cls.create_from_hdr(image_dir, hdf5_path, time_interval)
        return super(Series, cls).__new__(cls)

    def __init__(self, hdf5_path, cachedir=CACHE, *args, **kwargs):
        """
        Load Series from hdf5 file.

        :hdf5_path: path to related hdf5 file.
        :cachedir: path to cache directory, default current dir.
        """
        self.h5dict = File(hdf5_path, 'r+')
        for k, v in self.h5dict.attrs.items():
            setattr(self, k, v)

        self.cachedir = cachedir

    def save_attr(self):
        """
        save self's attributes:

        * break_points
        * simu_intervals

        to self.h5dict.attrs
        """
        for attr in ('break_points', 'simu_intervals'):
            if hasattr(self, attr):
                self.h5dict.attrs[attr] = getattr(self, attr)
        self.h5dict.flush()

    def _memoize(self, func, verbose=0):
        '''
        helper method for memory cache.
        '''
        if not hasattr(self, '_mymem'):
            self._mymem = joblib.Memory(cachedir=self.cachedir)

        memoized_func = self._mymem.cache(func, verbose=verbose)
        memoized_func.__doc__ = func.__doc__

        return memoized_func

    def _get_series(self, x, y, z):
        """
        return the time series (numpy array) at the voxel position (x, y, z)
        """
        if hasattr(self, 'start') and hasattr(self, 'end'):
            s, e = self.start, self.end
            times = self.h5dict['arr4d'][s:e, y, x, z]
        else:
            times = self.h5dict['arr4d'][:, y, x, z]
        return times

    def get_series(self, *args, **kwargs):
        """ cached method, cache mothod at first run """
        self.get_series = self._memoize(self._get_series)
        return self.get_series(*args, **kwargs)

    def _get_arr3d(self, t):
        """
        return the 3d(y, x, z) array at the time point t.
        """
        arr3d = self.h5dict['arr4d'][t, :, :, :] # (t, y, x, z)
        return arr3d

    def get_arr3d(self, *args, **kwargs):
        """ cached method, cache mothod at first run """
        self.get_arr3d = self._memoize(self._get_arr3d)
        return self.get_arr3d(*args, **kwargs)

    def _get_arr2d(self, t, k, axis='xy'):
        """
        return 2d array at the time point t.

        :t: (int) time point of 2d array
        :k: (int) index of the remaining dimension, e.g. if axis == 'xy', k means the index along the 'z' axis
        :axis: (str) the axis of 2d array. like: 'xy'(default), 'yz', 'xz'
        """
        arr3d = self.get_arr3d(t) # (t, y, x, z)
        assert axis in ('xy', 'yz', 'xz')
        if axis == 'xy':
            # k -> z
            arr2d = arr3d[:, :, k]
        elif axis == 'yz':
            # k -> x
            arr2d = arr3d[:, k, :]
        else: # 'xz'
            # k -> y
            arr2d = arr3d[k, :, :]
        return arr2d

    def get_arr2d(self, *args, **kwargs):
        """ cached method, cache mothod at first run """
        self.get_arr2d = self._memoize(self._get_arr2d)
        return self.get_arr2d(*args, **kwargs)

    def set_break_points(self, time_interval):
        """
        set break points (the image index numbers when the event occurs)
        :time_interval: (tuple) time interval when the event occurs,
            like: (100, 110)
        """
        start, end = time_interval
        assert start >= 0 and end <= self.n_images
        assert start < end
        self.break_points = (start, end)

    def set_range(self, start, end):
        """
        Set start and end positions, to take a subset of the series.
        NOTE: after this method runs, the behavior of `get_series` changes:
              `get_series` will return time_series[start:end]
        :start: (int) start position of time series
        :end: (int) end position of time series
        """
        msg = "series range set to (%d, %d),"%(start, end) +\
              " `get_series`'s behavior will change"
        log.warning(msg)
        self._mymem.clear()
        log.info("memory cache clear")
        self.start = start
        self.end = end

    def set_simu_intervals(self, intervals):
        """
        Set the time intervals used for the simulation
        :intervals: (list) a list of intervals. like: [(0, 10), (30, 50), (100, 110)]
        """
        # check intervals
        for start, end in intervals:
            assert start >= 0
            assert end <= self.n_images - 1
            assert start < end
        self.simu_intervals = intervals        

    def call_simu(self, algorithm, name, *args, **kwargs):
        """
        Call simulation region, store result in the dict: self.simu_results

        :algorithm: the name of simulation calling method
        :name: (str) the name of this result
        """
        log.info("call simulation region using {} algorithm".format(algorithm))
        calling = importlib.import_module('simucaller.call_simu')
        if not hasattr(self, 'simu_results'):
            self.simu_results = {}
        alg = getattr(calling, algorithm)
        result = alg(self, *args, **kwargs)
        self.simu_results.setdefault(algorithm, {})
        self.simu_results[algorithm][name] = result

    def list_simu_result(self):
        """
        list all simulation region call results.
        """
        res_list = [
            "%s/%s"%(alg_name, name)
                for alg_name, alg_group in self.h5dict['simulation_region_call'].items()
                    for name, _ in alg_group.items()
        ]
        return res_list

    def save_simu_result(self, algorithm, name):
        """
        Save simulation region call result to related hdf5 file.

        :algorithm: (str) name of algorithm
        :name: (name) the name of result dataset

        save path:
            self.h5dict -> simulation_region_call/<algorithm>/<name>
        """
        path = "simulation_region_call/{}/{}".format(algorithm, name)
        result = self.simu_results[algorithm][name]
        log.info("saving simulation call result to path: {}".format(path))
        self.h5dict.create_dataset(path, shape=result.shape)
        log.debug(result.shape)
        log.debug(type(result))
        self.h5dict[path][...] = result
        self.h5dict.flush()

    def get_simu_result(self, algorithm, name):
        """
        Load simulation region call result from hdf5 file.

        :algorithm: result's calling method.
        :name: (str) result dataset name.
        """
        result = self.h5dict['simulation_region_call'][algorithm][name][...]
        return result

    @classmethod
    def create_from_hdr(cls, image_dir, hdf5_path, time_interval):
        """
        Create hdf5 file from hdr images.
        NOTE: Image names must sort in the same order as the time sequence.

        :time_interval: time interval between two images, unit: 1 second
        """
        img_files = [i for i in listdir(image_dir) if i.endswith('.hdr')]
        img_files.sort(key=lambda i: i.split('.')[0])
        img_files = [join(image_dir, i) for i in img_files]

        load_img = lambda f: nib.load(f).get_data()
        log.info("loading hdr images ...")
        imgs = [load_img(i) for i in img_files]
        n_images = len(imgs)
        log.info("{} hdr images loaded.".format(n_images))

        # check image shapes; all images must have the same shape
        shape = y, x, z = imgs[0].shape
        log.info("image shape: {}".format(shape))
        for i, img in enumerate(imgs):
            assert img.shape == shape, \
                "Image {} expect in shape {} but get shape {}".format(
                    img_files[i], img.shape, shape
                )

        arr4d = np.array(imgs) # shape: (t, y, x, z)
        log.debug(arr4d[arr4d != 0])

        # store data
        h5dict = File(hdf5_path, 'w')
        log.info("hdf5 file created at {}".format(hdf5_path))
        h5dict.create_dataset('arr4d', shape=arr4d.shape)
        h5dict['arr4d'][...] = arr4d
        log.info("time series dataset shape {}".format(arr4d.shape))

        # store meta data
        h5dict.attrs['n_images'] = n_images
        h5dict.attrs['shape'] = arr4d.shape
        h5dict.attrs['time_interval'] = float(time_interval)
        log.info("time interval: {}s".format(time_interval))
        h5dict.close() # close hdf5 file
        log.info("Series hdf5 file creating process finished")

    def __del__(self):
        self.h5dict.close()
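A minimal usage sketch for the class above, assuming a directory of .hdr images and the module-level CACHE default (paths, coordinates and the time interval are illustrative):

# first run: builds series.h5 from the hdr images, then loads it
s = Series('series.h5', image_dir='./hdr_images', time_interval=2.0)
trace = s.get_series(10, 20, 5)  # cached time series at voxel (x=10, y=20, z=5)
vol = s.get_arr3d(0)             # cached 3-D volume at time point 0
s.save_attr()                    # flush any stored attributes to the hdf5 file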
Example #10
 def test_flush(self):
     """ Flush via .flush method """
     fid = File(self.mktemp(), 'w')
     fid.flush()
     fid.close()
Example #11
def write_data_to_file(datadict: DataDict,
                       f: h5py.File,
                       groupname: str = 'data',
                       append_mode: AppendMode = AppendMode.new,
                       swmr_mode: bool = True) -> None:

    if groupname not in f:
        raise RuntimeError('Group does not exist, initialize file first.')
    grp = f[groupname]

    # if we want to use swmr, we need to make sure that we're not
    # creating any more objects (see hdf5 docs).
    allexist = True
    for k, v in datadict.data_items():
        if k not in grp:
            allexist = False

    # add top-level meta data.
    for k, v in datadict.meta_items(clean_keys=False):
        set_attr(grp, k, v)

    f.flush()
    if allexist and swmr_mode and not f.swmr_mode:
        f.swmr_mode = True

    for k, v in datadict.data_items():
        data = v['values']
        shp = data.shape
        nrows = shp[0]

        # create new dataset, add axes and unit metadata
        if k not in grp:
            maxshp = tuple([None] + list(shp[1:]))
            ds = grp.create_dataset(k, maxshape=maxshp, data=data)

            # add meta data
            add_cur_time_attr(ds)

            if v.get('axes', []) != []:
                set_attr(ds, 'axes', v['axes'])
            if v.get('unit', "") != "":
                set_attr(ds, 'unit', v['unit'])

            for kk, vv in datadict.meta_items(k, clean_keys=False):
                set_attr(ds, kk, vv)

            ds.flush()

        # if the dataset already exists, append data according to
        # chosen append mode.
        else:
            ds = grp[k]
            dslen = ds.shape[0]

            if append_mode == AppendMode.new:
                newshp = tuple([nrows] + list(shp[1:]))
                ds.resize(newshp)
                ds[dslen:] = data[dslen:]
            elif append_mode == AppendMode.all:
                newshp = tuple([dslen + nrows] + list(shp[1:]))
                ds.resize(newshp)
                ds[dslen:] = data[:]

            ds.flush()
    f.flush()
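The writer-side flush() calls above matter for SWMR: a concurrent reader only sees newly appended rows after the writer has flushed and the reader has refreshed the dataset. A minimal reader-side sketch, assuming a file produced by the pattern above (the path and dataset name are illustrative):

import h5py

with h5py.File('data.ddh5', 'r', libver='latest', swmr=True) as f:
    ds = f['data/x']      # illustrative dataset path
    ds.refresh()          # pick up rows the writer has flushed since opening
    print(ds.shape[0], 'rows visible')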
Example #12
def make_nuc(ncc_file_path, n3d_file_path, out_file_name):
  
  if not out_file_name.lower().endswith('.nuc'):
    out_file_name = out_file_name + '.nuc'
  
  contact_dict = import_contacts(ncc_file_path)
  
  contact_name = os.path.splitext(os.path.basename(ncc_file_path))[0]
  
  pos_dict, coords_dict = import_coords(n3d_file_path)
  
  root = File(out_file_name, mode='w')
        
  hierarchy = (('contacts',    ('original', 'working')),
               ('display',     ()),
               ('chromosomes', ()),
               ('dataTracks',  ('derived', 'external', 'innate')),
               ('sample',      ('protocol', 'organism', 'tissue')),
               ('structures',  ('0',)),  # one-element tuple, not a bare string
               ('images',      ())
               )
   
  for parent, children in hierarchy:
    group = root.create_group(parent)
  
    for child in children:
      group.create_group(child)
  
  for child in ('particles', 'restraints', 'transforms', 'coords'):
    root['structures']['0'].create_group(child)
  
  now = int(time.time())
  random.seed(now)        
  
  root.attrs['id'] = np.array([random.random(), now, now], np.float32)
  
  root['sample'].attrs['name'] = np.string_('Unknown')  
  
  contact_group = root['contacts']['working'].create_group(contact_name)
  
  for chromoPair in contact_dict:
    chrA, chrB = chromoPair
    
    if chrA not in contact_group:
      contact_group.create_group(chrA)

    contact_group[chrA].create_dataset(chrB, dtype=np.uint32, data=contact_dict[chromoPair].T)
    
  coords_group   = root['structures']['0']['coords']
  particle_group = root['structures']['0']['particles']
 
  
  for chromo in coords_dict:
    coords_group.create_dataset(chromo, dtype=np.float64, data=coords_dict[chromo])
    
    pos = np.array(pos_dict[chromo], np.uint32)
    group = particle_group.create_group(chromo)
    group.create_dataset('positions', dtype=np.uint32, data=pos)
    
    chromo_group = root['chromosomes'].create_group(chromo)
    chromo_group.attrs['limits'] = np.array([pos.min(), pos.max()])
    
  root.flush()
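A minimal read-back sketch for the .nuc file produced above; the group layout follows the code, while the file name is illustrative:

from h5py import File

with File('structure.nuc', 'r') as root:
    for chromo in root['chromosomes']:
        print(chromo, root['chromosomes'][chromo].attrs['limits'])
    print(list(root['structures']['0']['coords'].keys()))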
Example #13
class labelManager(object):

    def __init__(self, fileName, startBlockNum = 0):
        self._f = File(fileName,'r+')
        self._blockNumber = startBlockNum
        self._maxLabelNum = 9999

    def addBlockLabel(self, data, start, stop=None, invert = False):
        if not stop:
            stop = [length + offset for length, offset in zip(data.shape, start)]

        if self._blockNumber <= self._maxLabelNum:
            dataset = self._f['PixelClassification/LabelSets/labels000'].create_dataset('block%04d' % self._blockNumber, data=(data.astype(np.uint8)))
            dataset.attrs.create('blockSlice',pointsToPosition(start, stop, invert))
            self._blockNumber += 1
        else:
            print('Warning: maximum label block number exceeded. Unable to add further labels.')


    def addMultipleSingleLabels(self, positions, labelValue):
        for point in positions.T:
            self.addSingleLabel(labelValue, pointsToPosition(point, point+1))

    def addSingleLabel(self, labelValue, position):
        dataset = self._f['PixelClassification/LabelSets/labels000'].create_dataset('block%04d' % self._blockNumber, data=[[[[np.uint8(labelValue)]]]])
        dataset.attrs.create('blockSlice',position)
        self._blockNumber += 1

    def clear(self):
        dataset = self._f['PixelClassification/LabelSets/labels000']
        for key in dataset.keys():
            del dataset[key]
        self._blockNumber = 0

    def getSubBlocks(self):
        """ returns subblocks containing the labels together with their corresponding offsets"""

        dataset = self._f['PixelClassification/LabelSets/labels000']
        labelBlocks = []
        for key in dataset:
            offset = strToPos(dataset[key].attrs.get('blockSlice'))
            values = dataset[key][()]  # read the full block (Dataset.value was removed in h5py 3)
            labelBlocks.append([offset, values])
            print(key)
        return labelBlocks

    def getInSingleBlock(self, shape=None):
        """ returns a block containing all the labels. The return is guaranteed to start at (0,0,0) global coordinates,
        it may however not cover the whole block (max(shape[0]), max(shape[1]), max(shape[2])), since there is no good way
        of determining the shape of the raw data from ilasti"""


        # get the labels as they are saved in the project
        labeledBlocks = self.getSubBlocks()

        offsets = np.array([labeledBlock[0] for labeledBlock in labeledBlocks])
        shapes = np.array([labeledBlock[1].shape[:3] for labeledBlock in labeledBlocks])
        data = [labelsBlock[1][:,:,:,0] for labelsBlock in labeledBlocks]

        if shape is None:
            # find out the dimension of the block, there should be a better way of doing that.
            shape = np.max(offsets + shapes[:,:3], axis=0)

        # write all labels into one big array
        labelBlockTotal = np.zeros(shape, dtype=np.uint8)
        for offset, shape, dataBlock in zip(offsets, shapes, data):
            index = (slice(offset[0], offset[0] + shape[0]),
                     slice(offset[1], offset[1] + shape[1]),
                     slice(offset[2], offset[2] + shape[2]))
            labelBlockTotal[index] += dataBlock

        return labelBlockTotal


    def flush(self):
        self._f.flush()

    def changeRawDataPath(self, newPath):
        """ deletes all saved paths and replaces it with the path 'newPath' """
        dataset = self._f['Input Data/infos/lane0000/Raw Data/']
        dataset.pop('filePath')
        dataset.create_dataset('filePath', data=newPath)
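A minimal usage sketch for labelManager, assuming an existing ilastik pixel-classification project with the standard PixelClassification/LabelSets layout and the module's pointsToPosition helper (the project path and label block are illustrative):

import numpy as np

mgr = labelManager('project.ilp')                 # illustrative ilastik project path
block = np.ones((10, 10, 10, 1), dtype=np.uint8)  # illustrative label block
mgr.addBlockLabel(block, start=[0, 0, 0, 0])
mgr.flush()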
Example #14
    'w')

for key in hinput.keys():
    hinput.copy('/' + key, houput['/'], name=key)
    if houput[key].ndim == 2:
        houput[key + '_c'] = houput[key][0:76, 181:183]
    elif houput[key].ndim == 3:
        houput[key + '_c'] = houput[key][0:76, 181:183, :]
    elif houput[key].ndim == 1:
        houput[key + '_c'] = houput[key][181:183]

    houput[key + '_c'].attrs.update(houput[key].attrs)
    del houput[key]
    houput[key] = houput[key + '_c']
    del houput[key + '_c']

    print(houput[key])

houput.attrs.update(hinput.attrs)

houput.flush()

print(hinput['/'])
print(houput['/'])

print([attr for attr in hinput.attrs])
print([attr for attr in houput.attrs])

hinput.close()
houput.close()
Example #15
class MovieDB(object):
    def __init__(self, name):
        super(MovieDB, self).__init__()
        self.name = name
        path = '/media/qwertyflagstop/data/{}.h5'.format(self.name)
        self.file = File(path, mode='a')

    def download_songs_from_list_in_file(self, file_name, num_workers=10):
        """

            :param file_name: the name of the file with IMDB ids
            :return: nothing; downloads the posters and plots
        """
        global ids
        global worker_count
        global index
        global tt_index
        with open(file_name, 'r') as fp:
            ids = set(json.load(fp))
        tt_index = 75000
        for k in self.file.keys():  #remove any we already got
            tt_index = max(tt_index, int(k[2:]))
            if k in ids:
                ids.remove(k)
        ids = list(ids)
        worker_count = num_workers
        index = 0
        queue = Queue()
        lock = Lock()
        for j in np.arange(0, num_workers):
            t = Thread(target=fetch_poster, args=[queue, lock, j])
            t.daemon = True
            t.start()
        while worker_count > 0:
            try:
                print('got {} movies'.format(len(self.file.keys())))
                s = queue.get(timeout=4)
                poster_bytes = np.array(s['poster'])
                plot = np.string_(s['plot'])
                self.file.create_dataset('{}/poster'.format(s['id']),
                                         data=poster_bytes)
                self.file.create_dataset('{}/plot'.format(s['id']), data=plot)
                self.file.flush()
            except:
                continue
        print('DONE!')
        self.file.flush()
        self.file.close()

    def view_random__images(self):
        ids = list(self.file.keys())  # materialize so the keys can be indexed below
        lengths = []
        chars = set()
        import string
        master_txt = open('plots.txt', 'w')
        for i in np.arange(0, len(ids)):
            movie_facts = np.array(self.file['{}/{}'.format(
                ids[i], 'plot')]).tostring()
            movie_facts = json.loads(movie_facts)
            l = movie_facts['Plot']
            p = ''.join([x for x in l if x in string.printable])
            p = p.replace('\n', ' ')
            master_txt.write(p)
            master_txt.write('\n')
            # image_bytes = np.array(self.file['{}/{}'.format(ids[j],'poster')])
            # v = Image.open(BytesIO(image_bytes.tostring()))
            # v.save('{}_.jpg'.format(i))
        master_txt.close()
Example #16
def close_file(file: h5py.File):
    file.flush()
    file.close()
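A minimal usage sketch (the path and dataset are illustrative):

import h5py

f = h5py.File('results.h5', 'w')       # illustrative path
f.create_dataset('x', data=[1, 2, 3])
close_file(f)                          # flush pending writes, then close the handle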
Example #17
def convert_cifar10(directory, output_directory,
                    output_filename='cifar10.hdf5'):
    """Converts the CIFAR-10 dataset to HDF5.
    Converts the CIFAR-10 dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as
    'cifar10.hdf5'.
    It assumes the existence of the following file:
    * `cifar-10-python.tar.gz`
    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'cifar10.hdf5'.
    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.
    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = File(output_path, mode='w')
    input_file = os.path.join(directory, DISTRIBUTION_FILE)
    tar_file = tarfile.open(input_file, 'r:gz')

    train_batches = []
    for batch in range(1, 6):
        file = tar_file.extractfile(
            'cifar-10-batches-py/data_batch_%d' % batch)
        try:
            if six.PY3:
                array = cPickle.load(file, encoding='latin1')
            else:
                array = cPickle.load(file)
            train_batches.append(array)
        finally:
            file.close()

    train_features = numpy.concatenate(
        [batch['data'].reshape(batch['data'].shape[0], 3, 32, 32)
            for batch in train_batches])
    train_labels = numpy.concatenate(
        [numpy.array(batch['labels'], dtype=numpy.uint8)
            for batch in train_batches])
    train_labels = numpy.expand_dims(train_labels, 1)

    print(train_features.shape)
    print(train_labels.shape)

    flipped_train_features = train_features[:,:,:,::-1]

    train_features = numpy.array([val for pair in zip(train_features, flipped_train_features) for val in pair])
    train_labels = numpy.repeat(train_labels, 2, axis=0)

    print(train_features.shape)
    print(train_labels.shape)

    file = tar_file.extractfile('cifar-10-batches-py/test_batch')
    try:
        if six.PY3:
            test = cPickle.load(file, encoding='latin1')
        else:
            test = cPickle.load(file)
    finally:
        file.close()

    test_features = test['data'].reshape(test['data'].shape[0],
                                         3, 32, 32)
    test_labels = numpy.array(test['labels'], dtype=numpy.uint8)
    test_labels = numpy.expand_dims(test_labels, 1)

    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #18
 def test_flush(self):
     """ Flush via .flush method """
     fid = File(self.mktemp(), 'w')
     fid.flush()
     fid.close()
Example #19
class labelManager(object):
    def __init__(self, fileName, startBlockNum=0):
        self._f = File(fileName, 'r+')
        self._blockNumber = startBlockNum
        self._maxLabelNum = 9999

    def addBlockLabel(self, data, start, stop=None, invert=False):
        if not stop:
            stop = [
                length + offset for length, offset in zip(data.shape, start)
            ]

        if self._blockNumber <= self._maxLabelNum:
            dataset = self._f[
                'PixelClassification/LabelSets/labels000'].create_dataset(
                    'block%04d' % self._blockNumber,
                    data=(data.astype(np.uint8)))
            dataset.attrs.create('blockSlice',
                                 pointsToPosition(start, stop, invert))
            self._blockNumber += 1
        else:
            print('Warning: maximum label block number exceeded. Unable to add further labels.')

    def addMultipleSingleLabels(self, positions, labelValue):
        for point in positions.T:
            self.addSingleLabel(labelValue, pointsToPosition(point, point + 1))

    def addSingleLabel(self, labelValue, position):
        dataset = self._f[
            'PixelClassification/LabelSets/labels000'].create_dataset(
                'block%04d' % self._blockNumber,
                data=[[[[np.uint8(labelValue)]]]])
        dataset.attrs.create('blockSlice', position)
        self._blockNumber += 1

    def clear(self):
        dataset = self._f['PixelClassification/LabelSets/labels000']
        for key in dataset.keys():
            del dataset[key]
        self._blockNumber = 0

    def getSubBlocks(self):
        """ returns subblocks containing the labels together with their corresponding offsets"""

        dataset = self._f['PixelClassification/LabelSets/labels000']
        labelBlocks = []
        for key in dataset:
            offset = strToPos(dataset[key].attrs.get('blockSlice'))
            values = dataset[key][()]  # read the full block (Dataset.value was removed in h5py 3)
            labelBlocks.append([offset, values])
            print(key)
        return labelBlocks

    def getInSingleBlock(self, shape=None):
        """ returns a block containing all the labels. The return is guaranteed to start at (0,0,0) global coordinates,
        it may however not cover the whole block (max(shape[0]), max(shape[1]), max(shape[2])), since there is no good way
        of determining the shape of the raw data from ilasti"""

        # get the labels as they are saved in the project
        labeledBlocks = self.getSubBlocks()

        offsets = np.array([labeledBlock[0] for labeledBlock in labeledBlocks])
        shapes = np.array(
            [labeledBlock[1].shape[:3] for labeledBlock in labeledBlocks])
        data = [labelsBlock[1][:, :, :, 0] for labelsBlock in labeledBlocks]

        if shape is None:
            # find out the dimension of the block, there should be a better way of doing that.
            shape = np.max(offsets + shapes[:, :3], axis=0)

        # write all labels into one big array
        labelBlockTotal = np.zeros(shape, dtype=np.uint8)
        for offset, shape, dataBlock in zip(offsets, shapes, data):
            index = (
                slice(offset[0], offset[0] + shape[0]),
                slice(offset[1], offset[1] + shape[1]),
                slice(offset[2], offset[2] + shape[2])
            )
            labelBlockTotal[index] += dataBlock

        return labelBlockTotal

    def flush(self):
        self._f.flush()

    def changeRawDataPath(self, newPath):
        """ deletes all saved paths and replaces it with the path 'newPath' """
        dataset = self._f['Input Data/infos/lane0000/Raw Data/']
        dataset.pop('filePath')
        dataset.create_dataset('filePath', data=newPath)