Example #1
def store_all_data(combined_data, cancer_type_list, output_filename):

    combined_data = combined_data.astype(str)

    with h5py.File(output_filename, "w") as out_f:

        dset = out_f.create_dataset("data",
                                    combined_data.shape,
                                    dtype=h5py.string_dtype('utf-8'))
        dset[:, :] = combined_data.values

        rowset = out_f.create_dataset("index",
                                      combined_data.index.shape,
                                      dtype=h5py.string_dtype('utf-8'))
        rowset[:] = combined_data.index.values

        colset = out_f.create_dataset("columns",
                                      combined_data.columns.shape,
                                      dtype=h5py.string_dtype('utf-8'))
        colset[:] = combined_data.columns.values

        ctype_set = out_f.create_dataset("cancer_types",
                                         (len(cancer_type_list), ),
                                         dtype=h5py.string_dtype('utf-8'))
        ctype_set[:] = cancer_type_list

    return
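
For reference, a file written by store_all_data could be loaded back as sketched below. This is not part of the original example; it assumes pandas is available and h5py 3.x, where variable-length string datasets are returned as bytes unless .asstr() is used to decode them.

import h5py
import pandas as pd

def load_all_data(filename):
    # Hypothetical reader for the layout written above; .asstr() decodes UTF-8 bytes to str.
    with h5py.File(filename, "r") as in_f:
        values = in_f["data"].asstr()[:, :]
        index = in_f["index"].asstr()[:]
        columns = in_f["columns"].asstr()[:]
        cancer_types = list(in_f["cancer_types"].asstr()[:])
    # Rebuild the stored DataFrame (all values were converted to strings on save).
    return pd.DataFrame(values, index=index, columns=columns), cancer_types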
Example #2
    def init_log(self, maxlog: int) -> None:
        """Init logging interface to hdf5 file.

        @param maxlog: (initial) maximum number of log lines to reserve in the hdf5 file.
            If more space is needed, maxlog more lines will be reserved;
            unused lines will be removed at run end.
        @type maxlog: int

        """
        self.maxlog = maxlog
        self.dlog = maxlog
        size = MPI_STATUS.size
        self.logging = self.h5file.create_group("Logging")
        self.logcount = self.logging.create_dataset("count", (size, ),
                                                    fillvalue=0,
                                                    dtype="int32")
        self.logs = self.logging.create_dataset(
            "logs",
            (size, maxlog),
            maxshape=(size, None),
            dtype=[
                ("level", "int32"),
                ("time", string_dtype(length=18)),
                ("runtime", "float32"),
                ("message", string_dtype(length=self.maxstrlen)),
            ],
        )
        MPI_GATE.register_function("addlog", self.add_log_line)
        self._init_log = True
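
As a minimal, self-contained sketch of the compound-dtype pattern used above (hypothetical file name and field lengths, not taken from the original project):

import h5py
import numpy as np

# One log record: an integer level plus two fixed-length UTF-8 string fields.
log_dtype = np.dtype([
    ("level", "int32"),
    ("time", h5py.string_dtype(encoding="utf-8", length=32)),
    ("message", h5py.string_dtype(encoding="utf-8", length=256)),
])

with h5py.File("logs.h5", "w") as f:
    logs = f.create_dataset("logs", shape=(1,), maxshape=(None,), dtype=log_dtype)
    logs[0] = (20, b"2024-01-01T00:00:00", b"run started")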
Example #3
def write_hdf(dataframe, ctype_ls, barcode_ls, filename):

    with h5py.File(filename, "w") as f:

        dset = f.create_dataset("data", dataframe.shape, dtype=float)
        dset[:, :] = dataframe.values

        columns = f.create_dataset("columns",
                                   dataframe.columns.shape,
                                   dtype=h5py.string_dtype('utf-8'))
        columns[:] = dataframe.columns.values

        idx = f.create_dataset("index",
                               dataframe.index.shape,
                               dtype=h5py.string_dtype('utf-8'))
        idx[:] = dataframe.index.values

        ctypes = f.create_dataset("cancer_types",
                                  len(ctype_ls),
                                  dtype=h5py.string_dtype('utf-8'))
        ctypes[:] = ctype_ls

        barcodes = f.create_dataset("barcodes",
                                    len(barcode_ls),
                                    dtype=h5py.string_dtype('utf-8'))
        barcodes[:] = barcode_ls

    return
Example #4
def writeHDF5(objCollectionExt, extMagData, hdfOutputFileName):
    f = h5py.File(hdfOutputFileName, 'a')

    grpTables = f.create_group('Tables')

    # create object index table
    nObjects = objCollectionExt.nObj
    nFilters = objCollectionExt.nFilters
    
    objTable = np.empty((nObjects), dtype=[('objName',h5py.string_dtype()), ('indexLo', 'i4'), ('indexHi', 'i4')])
    for (i, objName) in enumerate(objCollectionExt.objNames):
        objTable[i]['objName'] = objName
        objTable[i]['indexLo'] = objCollectionExt.objSynSlice[objName].start
        objTable[i]['indexHi'] = objCollectionExt.objSynSlice[objName].stop

    print(objTable)
    grpTables.create_dataset('objectTable', data=objTable)

    # create variable index table
    varTable = np.empty((4 + nFilters), dtype=[('varName',h5py.string_dtype()), ('index', 'i4')])
    varTable[0] = ('Teff', objCollectionExt.offsetTeff)
    varTable[1] = ('logg', objCollectionExt.offsetLogg)
    varTable[2] = ('Av', objCollectionExt.offsetAv)
    varTable[3] = ('DM', objCollectionExt.offsetDM)
    for (i, filterName) in enumerate(objCollectionExt.filterNames):
        varTable[i+4] = (filterName, i+4)

    print(varTable)
    grpTables.create_dataset('varTable', data=varTable)

    grpData = f.create_group('Data')
    grpData.create_dataset('ChainData', data=extMagData)
    
    f.close()
Example #5
    def _save_run_results_hdf(outfile, results):
        # results: model_id timestamp class_labels (bin + roi_numbers)
        #          input_images output_classes output_scores

        with h5.File(outfile, 'w') as f:
            meta = f.create_dataset('metadata', data=h5.Empty('f'))
            meta.attrs['version'] = results['version']
            meta.attrs['model_id'] = results['model_id']
            meta.attrs['timestamp'] = results['timestamp']
            f.create_dataset('output_classes',
                             data=results['output_classes'],
                             compression='gzip',
                             dtype='float16')
            f.create_dataset('output_scores',
                             data=results['output_scores'],
                             compression='gzip',
                             dtype='float16')
            f.create_dataset('class_labels',
                             data=np.string_(results['class_labels']),
                             compression='gzip',
                             dtype=h5.string_dtype())
            if results['bin_id']:
                meta.attrs['bin_id'] = results['bin_id']
                f.create_dataset('roi_numbers',
                                 data=results['roi_numbers'],
                                 compression='gzip',
                                 dtype='uint16')
            else:
                f.create_dataset('input_images',
                                 data=np.string_(results['input_images']),
                                 compression='gzip',
                                 dtype=h5.string_dtype())
Example #6
def create_plate(h5_file_path: Path, n_images: int):
    """
    Allocate space for the hdf5 arrays on disk for a given plate.
    """

    with h5py.File(h5_file_path, "w") as h5_file:
        h5_file.attrs["timestamp"] = datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        h5_file.attrs["info"] = h5py.version.info
        h5_file.create_dataset("images", (n_images, ) + constants.IMG_SHAPE,
                               np.float16)
        h5_file.create_dataset("site", (n_images, ), np.uint8)
        h5_file.create_dataset("well", (n_images, ),
                               h5py.string_dtype(encoding="utf-8"))
        h5_file.create_dataset("replicate", (n_images, ), np.uint8)
        h5_file.create_dataset("plate", (n_images, ),
                               h5py.string_dtype(encoding="utf-8"))
        h5_file.create_dataset(
            "compound",
            (n_images, ),
            h5py.string_dtype(encoding="utf-8"),
        )
        h5_file.create_dataset("concentration", (n_images, ), np.float16)
        h5_file.create_dataset("moa", (n_images, ),
                               h5py.string_dtype(encoding="utf-8"))
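
A complementary sketch of how the pre-allocated slots might be filled one image at a time (hypothetical helper; the project's actual loader is not shown here):

import h5py
import numpy as np

def write_image(h5_file_path, i, image, well, compound):
    # Fill the i-th slot reserved by create_plate.
    with h5py.File(h5_file_path, "r+") as h5_file:
        h5_file["images"][i] = image.astype(np.float16)
        h5_file["well"][i] = well          # stored as variable-length UTF-8
        h5_file["compound"][i] = compound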
Example #7
def dump_data(arr, ctypes, patients, idx, out_hdf):

    print("Saving results: ", out_hdf)

    with h5py.File(out_hdf, "w") as f_out:

        # Store the feature set
        rows = f_out.create_dataset("index",
                                    shape=idx.shape,
                                    dtype=h5py.string_dtype('utf-8'))
        rows[:] = idx

        leading = 0
        lagging = 0

        # For each cancer type:
        for ct, pat in zip(ctypes, patients):

            leading += len(pat)

            # Store the data values
            dset = f_out.create_dataset(ct + "/data",
                                        shape=(arr.shape[0], len(pat)))

            dset[:] = arr[:, lagging:leading]

            # Store the columns
            columns = f_out.create_dataset(ct + "/columns",
                                           shape=(len(pat), ),
                                           dtype=h5py.string_dtype('utf-8'))
            columns[:] = pat

            lagging = leading

    return
Example #8
    def dataToHdf(self, file):
        # write the cell data
        file.create_dataset('Data', data=self.table_data)
        # write the header data
        headerData = np.array(self.columnHeaderData,
                              dtype=h5py.string_dtype(encoding='utf-8'))
        file.create_dataset('HeaderData',
                            data=headerData,
                            dtype=h5py.string_dtype(encoding='utf-8'))
Example #9
    def create_parameters(
        self,
        max: int = 10,
        method: str = "complete",
        extra: dict = {},
        force_create=False,
    ) -> str:
        filename = self.get_dataset_filename("data", "hdf5")
        if os.path.isfile(filename) and not force_create:
            print(
                "Parameter file exists, not recreating (use --regenerate_samples if you want to force)"
            )
            return filename
        print("+" * 40)
        print(f"Generating Dataset {self.name}, {max} examples")
        print(f"Datasets: {self.dataset_dir}")
        print("+" * 40)

        # Save out the parameters first
        self.save_parameters()

        # Generate the set of samples (could switch to generators,
        # but need to figure out arbitrary size arrays in HDF5)
        dataset: List[Sample] = []
        if method == "complete":
            dataset = self.parameters.recursively_generate_all()
        else:
            dataset = self.parameters.sample_space(sample_size=max)

        # Create the data file and add all the points to it
        with h5py.File(filename, "w") as datafile:
            # Figure out the sizes to store
            records = len(dataset)
            param_size = len(dataset[0].encode())

            # Add columns to it
            filenames = datafile.create_dataset("files", (records, ),
                                                dtype=h5py.string_dtype())
            parameters = datafile.create_dataset("parameters", (records, ),
                                                 dtype=h5py.string_dtype())
            labels = datafile.create_dataset("labels", (records, param_size))
            audio_exists = datafile.create_dataset("audio_exists", (records, ),
                                                   dtype=np.bool_)

            # Generate the sample points
            for index, point in enumerate(dataset):
                params = self.parameters.to_settings(point)
                filenames[index] = self.get_wave_filename(index)
                labels[index] = point.encode()
                parameters[index] = json.dumps(params)
                audio_exists[index] = False
                if index % 1000 == 0:
                    print("Generating parameters for example {}".format(index))
            datafile.flush()

        return filename
Example #10
def write_array(file: IOBase, component: str, array: Array):
    # TODO : More validation on the inputs?
    group = get_write_group(file, component)
    for dataset in group:
        del group[dataset]
    group.create_dataset("array", data=array.data, track_times=False)
    if array.dimensions is not None:
        for i, dimension in enumerate(array.dimensions, start=1):
            if dimension.title is not None:
                group.create_dataset(
                    f"Dimension_{i}_title",
                    dtype=h5py.string_dtype(),
                    shape=(),
                    data=dimension.title,
                    track_times=False,
                )
            if dimension.names is not None:
                encoded_names = np.char.encode(dimension.names)
                group.create_dataset(
                    f"Dimension_{i}_names",
                    dtype=h5py.string_dtype(),
                    shape=encoded_names.shape,
                    data=encoded_names,
                    track_times=False,
                )
            if dimension.values is not None:
                values = np.array(dimension.values)
                group.create_dataset(
                    f"Dimension_{i}_values",
                    dtype=values.dtype,
                    shape=values.shape,
                    data=values,
                    track_times=False,
                )
            if dimension.units is not None:
                group.create_dataset(
                    f"Dimension_{i}_units",
                    dtype=h5py.string_dtype(),
                    shape=(),
                    data=dimension.units,
                    track_times=False,
                )
    if array.units is not None:
        group.create_dataset(
            "units",
            dtype=h5py.string_dtype(),
            shape=(),
            data=array.units,
            track_times=False,
        )
Example #11
def main():
    with h5py.File(file_path, 'w') as f:
        dataset = f.create_dataset('/group/dataset', shape=(3, 4), dtype='i')

        dataset[:] = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]

        dataset.attrs['double'] = math.pi

        hello = '早上好!'
        dataset.attrs['string-vlen'] = hello

        hello_utf8 = hello.encode('utf-8')
        hello_ascii = 'Hello, world!'

        dataset.attrs.create('string-ascii', hello_ascii, None,
                             '<S{0}'.format(len(hello_ascii)))

        utf8_type = h5py.string_dtype('utf-8', len(hello_utf8))
        # HDFView can not display the value of this attribute correctly, ViTables can.
        dataset.attrs.create('string', hello_utf8, None, utf8_type)

        dataset.attrs['boolean'] = True

        color_dt = h5py.enum_dtype({"RED": 0, "GREEN": 1, "BLUE": 42},
                                   basetype='i')
        dataset.attrs.create('color', 42, dtype=color_dt)
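
Reading these attributes back might look like the sketch below (reusing the file_path from the example; behaviour described for h5py 3.x, where variable-length string attributes are decoded to str while fixed-length ones come back as bytes):

import h5py

with h5py.File(file_path, "r") as f:
    dset = f["/group/dataset"]
    print(dset[0, :])                  # [1 2 3 4]
    print(dset.attrs["double"])        # 3.141592653589793
    print(dset.attrs["string-vlen"])   # '早上好!' as str (variable-length UTF-8)
    print(dset.attrs["string"])        # fixed-length attribute, returned as bytes
    print(dset.attrs["color"])         # 42, the enum's underlying integer value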
Example #12
    def write(self, file_name):
        """
        Write the info to a file.
        If the universe has been processed, this information is also written.

        :param file_name:
        :returns:
        :rtype:

        """

        dt = h5py.string_dtype(encoding="utf-8")

        with h5py.File(file_name, "w") as f:

            f.attrs["n_grbs"] = self._n_grbs
            f.attrs["is_processed"] = self._is_processed
            f.attrs["population_file"] = self._population_file

            grbs = f.create_dataset(
                "grb_saves", data=np.array(self._grb_save_files, dtype=dt)
            )

            if self._is_processed:

                grb_dets = f.create_dataset(
                    "grb_dets", data=np.array(self._grb_detector_files, dtype=dt)
                )
Example #13
 def flush(self):
     if self._writable:
         # only write `_NCProperties` in newly created files
         if not self._preexisting_file and not self.invalid_netcdf:
             _NC_PROPERTIES = "version=2,h5netcdf=%s,hdf5=%s,%s=%s" % (
                 __version__,
                 self._h5py.version.hdf5_version,
                 self._h5py.__name__,
                 self._h5py.__version__,
             )
             self.attrs._h5attrs["_NCProperties"] = np.array(
                 _NC_PROPERTIES,
                 dtype=h5py.string_dtype(
                     encoding="ascii", length=len(_NC_PROPERTIES)
                 ),
             )
         if self.invalid_netcdf:
             # see https://github.com/h5netcdf/h5netcdf/issues/165
             # warn user if .nc file extension is used for invalid netcdf features
             if os.path.splitext(self.filename)[1] == ".nc":
                 msg = (
                     f"You are writing invalid netcdf features to file "
                     f"`{self.filename}`. The file will thus be not conforming "
                     f"to NetCDF-4 standard and might not be readable by other "
                     f"netcdf tools. Consider using a different extension."
                 )
                 warnings.warn(msg, UserWarning, stacklevel=2)
             # remove _NCProperties if invalid_netcdf if exists
             if "_NCProperties" in self.attrs._h5attrs:
                 del self.attrs._h5attrs["_NCProperties"]
Example #14
    def save(self, save_filename=None):
        # *************************
        # *** Save data to HDF5 ***
        # *************************
        if save_filename is None:
            script_path = os.path.realpath(__file__)  # full path of current script
            current_dir, script_basename = os.path.split(script_path)
            script_filename = os.path.splitext(script_basename)[0]  # name of current script
            timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())  # current date and time
            save_basename = f"{script_filename:s}_{timestamp:s}.h5"  # name of save file
            save_path = os.path.join(current_dir, "data", save_basename)  # full path of save file
        else:
            save_path = os.path.realpath(save_filename)

        source_code = get_sourcecode(__file__)  # save also the sourcecode of the script for future reference
        with h5py.File(save_path, "w") as h5f:
            dt = h5py.string_dtype(encoding='utf-8')
            ds = h5f.create_dataset("source_code", (len(source_code), ), dt)
            for ii, line in enumerate(source_code):
                ds[ii] = line

            for attribute in self.__dict__:
                print(f"{attribute}: {self.__dict__[attribute]}")
                if attribute.startswith("_"):
                    # don't save private attributes
                    continue
                if attribute == "jpa_params":
                    h5f.attrs[attribute] = str(self.__dict__[attribute])
                elif np.isscalar(self.__dict__[attribute]):
                    h5f.attrs[attribute] = self.__dict__[attribute]
                else:
                    h5f.create_dataset(attribute, data=self.__dict__[attribute])
        print(f"Data saved to: {save_path}")
        return save_path
Example #15
def _create_dataset(file_handle, ds_name, data):
    data = _convert_list(data)

    try:

        if issubclass(data.dtype.type, bytes):
            data = np.void(data)  # byte strings aren't handled properly
        dataset = \
            file_handle.create_dataset(ds_name,
                                       maxshape=(None,) + data.shape[1:],
                                       data=data)
    except TypeError:
        if issubclass(data.dtype.type, str):
            dtype = h5py.string_dtype(encoding='utf-8')
        else:
            raise TypeError

        dataset = file_handle.create_dataset(ds_name,
                                             shape=data.shape,
                                             maxshape=(None, ) +
                                             data.shape[1:],
                                             dtype=dtype)
        dataset[:] = data

    return dataset
Example #16
 def _save_validation_results_hdf(self, outfile, results):
     attrib_data = ['model_id', 'timestamp']
     attrib_data += 'f1_weighted recall_weighted precision_weighted f1_macro recall_macro precision_macro'.split(
     )
     int_data = [
         'input_classes', 'output_classes'
     ] + 'counts_perclass val_counts_perclass train_counts_perclass'.split(
     )
     int_data.extend([
         'classes_by_' + stat
         for stat in 'f1 recall precision count'.split()
     ])
     string_data = ['class_labels', 'image_fullpaths', 'image_basenames']
     with h5.File(outfile, 'w') as f:
         meta = f.create_dataset('metadata', data=h5.Empty('f'))
         for series in results:
             if series in attrib_data: meta.attrs[series] = results[series]
             elif series in string_data:
                 f.create_dataset(series,
                                  data=np.string_(results[series]),
                                  compression='gzip',
                                  dtype=h5.string_dtype())
             elif series in int_data:
                 f.create_dataset(series,
                                  data=results[series],
                                  compression='gzip',
                                  dtype='int16')
             elif isinstance(results[series], np.ndarray):
                 f.create_dataset(series,
                                  data=results[series],
                                  compression='gzip',
                                  dtype='float16')
             else:
                 raise UserWarning(
                     'hdf results: WE MISSED THIS ONE: {}'.format(series))
Example #17
    def save_collection(self, dest: Path) -> None:
        """Save the collection (queries and documents). Use the unique integer IDs for queries and documents.
        The original IDs can be recovered through a mapping that is also saved.

        Args:
            dest (Path): The file to create.
        """
        str_dt = h5py.string_dtype(encoding="utf-8")
        with h5py.File(dest, "w") as fp:
            ds = {
                "queries":
                fp.create_dataset("queries", (len(self.queries), ),
                                  dtype=str_dt),
                "orig_q_ids":
                fp.create_dataset("orig_q_ids", (len(self.orig_q_ids), ),
                                  dtype=str_dt),
                "docs":
                fp.create_dataset("docs", (len(self.docs), ), dtype=str_dt),
                "orig_doc_ids":
                fp.create_dataset("orig_doc_ids", (len(self.orig_doc_ids), ),
                                  dtype=str_dt),
            }
            for q_id, query in tqdm(self.queries.items(),
                                    desc="Saving queries"):
                ds["queries"][q_id] = query
                ds["orig_q_ids"][q_id] = self.orig_q_ids[q_id]

            for doc_id, doc in tqdm(self.docs.items(),
                                    desc="Saving documents"):
                ds["docs"][doc_id] = doc
                ds["orig_doc_ids"][doc_id] = self.orig_doc_ids[doc_id]
Example #18
    def to_hdf(self, hdf, group_name="structures"):
        # truncate arrays to necessary size before writing
        self._resize_atoms(self.num_atoms)
        self._resize_structures(self.num_structures)

        with hdf.open(group_name) as hdf_s_lst:
            self._type_to_hdf(hdf_s_lst)
            hdf_s_lst["num_atoms"] = self._num_atoms_alloc
            hdf_s_lst["num_structures"] = self._num_structures_alloc

            hdf_arrays = hdf_s_lst.open("arrays")
            for k, a in chain(self._per_atom_arrays.items(),
                              self._per_structure_arrays.items()):
                if a.dtype.char == "U":
                    # numpy stores unicode data in UTF-32/UCS-4, but h5py wants UTF-8, so we manually encode them here
                    # TODO: string arrays with shape != () not handled
                    hdf_arrays[k] = np.array(
                        [s.encode("utf8") for s in a],
                        # each character in a utf8 string might be encoded in up to 4 bytes, so to
                        # make sure we can store any string of length n we tell h5py that the
                        # string will be 4 * n bytes; numpy's dtype does this calculation already
                        # in itemsize, so we don't need to repeat it here
                        # see also https://docs.h5py.org/en/stable/strings.html
                        dtype=h5py.string_dtype('utf8', a.dtype.itemsize))
                else:
                    hdf_arrays[k] = a
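
A small standalone sketch of the sizing logic mentioned in the comment above (made-up data): numpy unicode arrays use 4 bytes per character (UCS-4), so dtype.itemsize is already an upper bound for the UTF-8 encoding.

import h5py
import numpy as np

a = np.array(["Fe", "H2O", "CH3OH"])   # dtype '<U5', itemsize 20 bytes
dt = h5py.string_dtype("utf-8", a.dtype.itemsize)

with h5py.File("arrays.h5", "w") as f:
    # Encode to UTF-8 bytes and store as fixed-length strings of itemsize bytes.
    f.create_dataset("symbols", data=np.array([s.encode("utf8") for s in a], dtype=dt))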
Example #19
    def tokenize(self, input_hdf5_group: str, input_hdf5_dataset: str,
                 output_hdf5_group: str, output_hdf5_dataset_tokenized: str,
                 output_hdf5_dataset_tokenized_id: str) -> None:
        with h5py.File(self.hdf5_path, "a") as hdf5_store:
            hdf5_group = hdf5_store.get(input_hdf5_group)
            captions = numpy.array(hdf5_group[input_hdf5_dataset])

            captions_tokenized = []
            captions_tokenized_id = []

            for caption in tqdm(captions):
                caption_tokenized = (
                    self.tokenizer.encode_with_bos_eos(caption))
                caption_tokenized_id = (
                    self.tokenizer.encode_ids_with_bos_eos(caption))
                captions_tokenized.append(caption_tokenized)
                captions_tokenized_id.append(caption_tokenized_id)

            if output_hdf5_dataset_tokenized in hdf5_group.keys():
                del hdf5_group[output_hdf5_dataset_tokenized]
            if output_hdf5_dataset_tokenized_id in hdf5_group.keys():
                del hdf5_group[output_hdf5_dataset_tokenized_id]

            hdf5_group.create_dataset(
                output_hdf5_dataset_tokenized,
                data=numpy.array(captions_tokenized,
                                 dtype=h5py.string_dtype(encoding="utf-8")))
            token_id_dataset = hdf5_group.create_dataset(
                output_hdf5_dataset_tokenized_id,
                shape=(len(captions_tokenized_id), ),
                dtype=h5py.vlen_dtype(numpy.dtype("int32")))
            token_id_dataset[...] = captions_tokenized_id
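
For completeness, a sketch of reading such a variable-length integer dataset back (hypothetical file and dataset names): each element is returned as its own 1-D numpy array, so rows may have different lengths.

import h5py

with h5py.File("captions.h5", "r") as hdf5_store:
    token_ids = hdf5_store["train/caption_tokenized_id"]
    first = token_ids[0]                          # int32 array, length varies per caption
    lengths = [len(row) for row in token_ids[:]]  # ragged rows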
Example #20
    def test_fixed_ascii(self):
        dt = h5py.string_dtype(encoding='ascii', length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
Example #21
    def normalize_whitespace(self, input_hdf5_group: str,
                             input_hdf5_dataset: str, output_hdf5_group: str,
                             output_hdf5_dataset: str) -> None:
        self.logger.info("Normalizing whitespace. Data transfer:\n" +
                         f"\"{input_hdf5_group}/{input_hdf5_dataset}\" -> " +
                         f"\"{output_hdf5_group}/{output_hdf5_dataset}\".")

        self.regex_substitution(self.whitespace_regex,
                                self.whitespace_placeholder, input_hdf5_group,
                                input_hdf5_dataset, output_hdf5_group,
                                output_hdf5_dataset)

        with h5py.File(self.hdf5_path, "a") as hdf5_store:
            captions = numpy.array(
                hdf5_store.get(input_hdf5_group).get(input_hdf5_dataset))

            captions_cleaned = []

            for caption in captions:
                caption_cleaned = caption.strip()
                captions_cleaned.append(caption_cleaned)

            output_group = hdf5_store.require_group(output_hdf5_group)
            if output_hdf5_dataset in output_group.keys():
                del output_group[output_hdf5_dataset]
            output_group.create_dataset(
                output_hdf5_dataset,
                data=numpy.array(captions_cleaned,
                                 dtype=h5py.string_dtype(encoding="utf-8")))
Example #22
    def test_vlen_ascii(self):
        dt = h5py.string_dtype(encoding='ascii')

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is bytes
Example #23
    def test_fixed_utf8(self):
        dt = h5py.string_dtype(length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
Example #24
def create_files_and_datasets(params, samples_folder):
    """
    Function to create the hdf5 files (trn, val and tst).
    :param params: (dict) Parameters found in the yaml config file.
    :param samples_folder: (str) Path to the output folder.
    :return: (hdf5 datasets) trn, val and tst datasets.
    """
    samples_size = params['global']['samples_size']
    number_of_bands = params['global']['number_of_bands']
    meta_map = get_key_def('meta_map', params['global'], {})
    real_num_bands = number_of_bands - MetaSegmentationDataset.get_meta_layer_count(
        meta_map)
    assert real_num_bands > 0, "invalid number of bands when accounting for meta layers"
    hdf5_files = []
    for subset in ["trn", "val", "tst"]:
        hdf5_file = h5py.File(
            os.path.join(samples_folder, f"{subset}_samples.hdf5"), "w")
        hdf5_file.create_dataset(
            "sat_img", (0, samples_size, samples_size, real_num_bands),
            np.float32,
            maxshape=(None, samples_size, samples_size, real_num_bands))
        hdf5_file.create_dataset("map_img", (0, samples_size, samples_size),
                                 np.int16,
                                 maxshape=(None, samples_size, samples_size))
        hdf5_file.create_dataset("meta_idx", (0, 1),
                                 dtype=np.int16,
                                 maxshape=(None, 1))
        hdf5_file.create_dataset("metadata", (0, 1),
                                 dtype=h5py.string_dtype(),
                                 maxshape=(None, 1))
        hdf5_files.append(hdf5_file)
    return hdf5_files
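
Because every dataset here starts empty along the first axis with maxshape=(None, ...), new samples are presumably appended by resizing; a minimal sketch of that pattern (hypothetical helper, not the project's actual code):

import h5py
import numpy as np

def append_sample(hdf5_file, sat_img, map_img, meta_idx, metadata_str):
    # Grow each dataset by one row along axis 0, then write into the new slot.
    n = hdf5_file["sat_img"].shape[0]
    for name in ("sat_img", "map_img", "meta_idx", "metadata"):
        hdf5_file[name].resize(n + 1, axis=0)
    hdf5_file["sat_img"][n] = sat_img
    hdf5_file["map_img"][n] = map_img
    hdf5_file["meta_idx"][n, 0] = meta_idx
    hdf5_file["metadata"][n, 0] = metadata_str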
Example #25
    def test_vlen_utf8(self):
        dt = h5py.string_dtype()

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is str
Example #26
    def preproc(self):
        ''' preprocess data
        '''
        if self.source != 'loseit': raise NotImplementedError

        import glob
        import pandas as pd

        fweeks = glob.glob(os.path.join(dat_dir, 'WeeklySummary*.csv'))

        for i, fweek in enumerate(fweeks):
            week = pd.read_csv(fweek)
            if i == 0:
                cols = week.columns
                data = [np.array(week[col]) for col in cols]
            else:
                assert np.array_equal(np.array(week.columns), np.array(cols))

                for i in range(len(week.columns)):
                    data[i] = np.concatenate([data[i], week[cols[i]]])

        # save to hdf5
        h5 = h5py.File(os.path.join(dat_dir, '%s.hdf5' % self.source), 'w')
        # no meta data for now
        for i, col in enumerate(cols):
            if isinstance(data[i][0], str):
                h5.create_dataset(col, data=data[i], dtype=h5py.string_dtype())
            else:
                h5.create_dataset(col, data=data[i])
        h5.close()
        return None
Example #27
    def test_fixed_utf8(self):
        dt = h5py.string_dtype(length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
Example #28
def create_h5(df, hdf5name):
    # Exception occurs when files already exist.
    with h5py.File(hdf5name, 'w-') as f:
        dt_string = h5py.string_dtype()
        dset_fullname_creator = f.create_dataset('fullname_creator', (len(df.index),), dtype=dt_string)
        dset_material = f.create_dataset('material', (len(df.index),), dtype=dt_string)
        dset_type = f.create_dataset('type', (len(df.index),), dtype=dt_string)

        dset_fullname_creator_cat = f.create_dataset('fullname_creator_cat', (len(df.index),), dtype=dt_string)
        dset_material_cat = f.create_dataset('material_cat', (len(df.index),), dtype=dt_string)
        dset_type_cat = f.create_dataset('type_cat', (len(df.index),), dtype=dt_string)

        dt_uint8 = h5py.special_dtype(vlen=np.dtype('uint8'))
        dset_img = f.create_dataset('images', (len(df.index),), dtype=dt_uint8)
        
        for i, r in df.iterrows():
            filename = str(r['filename'])
            print(f'[{i}]: {filename}')
            with open(images_path+filename, 'rb') as fin:
                dset_img[i] = np.frombuffer(fin.read(), dtype='uint8')
            
            dset_fullname_creator[i] = r['fullname_creator']
            dset_fullname_creator_cat[i] = r['fullname_creator_cat']

            dset_material[i] = r['material']
            dset_material_cat[i] = r['material_cat']

            dset_type[i] = r['type']
            dset_type_cat[i] = r['type_cat']
    print('Done')
Example #29
    def test_fixed_ascii(self):
        dt = h5py.string_dtype(encoding='ascii', length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
Example #30
    def run(self) -> None:
        # TODO: check whether cache exists
        self.logger.info(f"Tokenizing caption data.")

        with h5py.File(self.hdf5_path, "a") as hdf5_store:
            for hdf5_group_name in self.raw_data_group_names.values():
                hdf5_group = hdf5_store.get(hdf5_group_name)
                captions = numpy.array(hdf5_group["caption_cleaned"])

                captions_tokenized = []
                captions_tokenized_id = []

                for caption in captions:
                    caption_tokenized = (
                        self.tokenizer.encode_with_bos_eos(caption))
                    caption_tokenized_id = (
                        self.tokenizer.encode_ids_with_bos_eos(caption))
                    captions_tokenized.append(caption_tokenized)
                    captions_tokenized_id.append(caption_tokenized_id)

                if "caption_cleaned_tokenized" in hdf5_group.keys():
                    del hdf5_group["caption_cleaned_tokenized"]
                if "caption_cleaned_tokenized_id" in hdf5_group.keys():
                    del hdf5_group["caption_cleaned_tokenized_id"]

                hdf5_group.create_dataset(
                    "caption_cleaned_tokenized",
                    data=numpy.array(
                        captions_tokenized,
                        dtype=h5py.string_dtype(encoding="utf-8")))
                token_id_dataset = hdf5_group.create_dataset(
                    "caption_cleaned_tokenized_id",
                    shape=(len(captions_tokenized_id), ),
                    dtype=h5py.vlen_dtype(numpy.dtype("int32")))
                token_id_dataset[...] = captions_tokenized_id
Example #31
    def get_result(self, discard=True):
        '''Get the result associated with this future, blocking until it is available.
        If ``discard`` is true, then removes the reference to the result contained
        in this instance, so that a collection of futures need not turn into a cache of
        all associated results.'''
        with self._condition:
            if self._done:
                if self._exception:
                    if isinstance(self._traceback, str):
                        if self._traceback:
                            log.error(
                                'uncaught exception in remote function\n{}'.
                                format(self._traceback))
                        raise self._exception
                    else:
                        raise self._exception.with_traceback(self._traceback)
            else:
                self._condition.wait()
                assert self._done
                if self._exception:
                    if isinstance(self._traceback, str):
                        log.error(
                            'uncaught exception in remote function\n{}'.format(
                                self._traceback))
                        raise self._exception
                    else:
                        raise self._exception.with_traceback(self._traceback)

            result = self._result
            if discard:
                del self._result
            return result
Example #32
 def test_bytestr(self):
     """ Indexing a byte string dataset returns a real python byte string
     """
     dset = self.f.create_dataset('x', (1, ),
                                  dtype=h5py.string_dtype(encoding='ascii'))
     dset[0] = b"Hello there!"
     self.assertEqual(type(dset[0]), bytes)
Example #33
    def test_vlen_utf8(self):
        dt = h5py.string_dtype()

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is six.text_type
Example #34
    def test_vlen_ascii(self):
        dt = h5py.string_dtype(encoding='ascii')

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is bytes
Example #35
    def test_compound(self):

        fields = []
        fields.append(('field_1', h5py.string_dtype()))
        fields.append(('field_2', np.int32))
        dt = np.dtype(fields)
        self.f['mytype'] = np.dtype(dt)
        dt_out = self.f['mytype'].dtype.fields['field_1'][0]
        string_inf = h5py.check_string_dtype(dt_out)
        self.assertEqual(string_inf.encoding, 'utf-8')
Example #36
    def test_vlen_string_array(self):
        """ Storage of vlen byte string arrays"""
        dt = h5py.string_dtype(encoding='ascii')

        data = np.ndarray((2,), dtype=dt)
        data[...] = b"Hello", b"Hi there!  This is HDF5!"

        self.f.attrs['x'] = data
        out = self.f.attrs['x']
        self.assertEqual(out.dtype, dt)
        self.assertEqual(out[0], data[0])
        self.assertEqual(out[1], data[1])