def write_imagespec(spec: ImageSpec, hfile: tables.File) -> None:
    hfile.root._v_attrs.crs = spec.crs
    hfile.create_array(hfile.root, name="x_coordinates", obj=spec.x_coordinates)
    hfile.create_array(hfile.root, name="y_coordinates", obj=spec.y_coordinates)

def _dump_to_h5(stream: BytesIO, store: tables.File, file_size: int, date: datetime.date):
    """Convert and dump to h5

    Args:
        stream (InputStream): input stream
        store (tables.File): pytables output file
        file_size (int): size of the file
        date (date): date of the file
    """
    out_price = dict()
    out_volume = dict()
    date_offset_epoch = datetime.datetime.fromordinal(date.toordinal()).timestamp()
    with tqdm(total=file_size, desc="Streaming", unit="B", unit_scale=1, ncols=100) as pbar:
        while True:
            chunk = _load_chunk(stream)
            if chunk is None:
                break
            payload, exchange, session, category, security, chunk_size = chunk
            key = (exchange, security)
            for typ, row in _parse_chunk(payload, date_offset_epoch):
                if typ == b"4P":
                    if key not in out_price:
                        out_price[key] = store.create_earray(
                            "/price",
                            _get_security_code(exchange, security),
                            obj=[list(row)],
                            createparents=True,
                        )
                    else:
                        out_price[key].append([list(row)])
                elif typ == b"VL":
                    if key not in out_volume:
                        out_volume[key] = store.create_earray(
                            "/volume",
                            _get_security_code(exchange, security),
                            obj=[list(row)],
                            createparents=True,
                        )
                    else:
                        out_volume[key].append([list(row)])
            pbar.update(chunk_size)

def get_nodes(h5file: tables.File, group: tables.Group) -> dict:
    result = {}
    logging.debug("Walk group: {}".format(group))
    for leaf in h5file.iter_nodes(group, classname="Leaf"):
        logging.debug("Add leaf: {}".format(leaf.name))
        result[leaf.name] = leaf
    # use a distinct name so the parameter `group` is not shadowed
    for subgroup in h5file.iter_nodes(group, classname="Group"):
        result[subgroup._v_name] = get_nodes(h5file, subgroup)
    return result

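# Hedged usage sketch for get_nodes (not part of the original module): the file name
# below is hypothetical. Leaves are keyed by name and sub-groups become nested dicts.
def _example_get_nodes_usage() -> None:
    import tables
    with tables.open_file("example_nodes.h5", mode="r") as f:
        nodes = get_nodes(f, f.root)
        # e.g. nodes["some_group"]["some_array"] would be a tables.Leaf
        print(list(nodes.keys()))
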
def _write_categorical_target_metadata(meta: CategoricalTarget, hfile: tables.File) -> None:
    hfile.root.categorical_data.attrs.D = meta.D
    hfile.root.categorical_data.attrs.N = meta.N
    _make_str_vlarray(hfile, "categorical_labels", meta.labels)
    _make_int_vlarray(hfile, "categorical_counts", meta.counts)
    _make_int_vlarray(hfile, "categorical_mappings", meta.mappings)
    hfile.create_array(hfile.root, name="categorical_nvalues", obj=meta.nvalues)

def _save_list_of_estimators(
    hdf_file: tables.File,
    group: tables.Group,
    estimator_list: List[BaseEstimator],
    fitted: bool,
):
    hdf_file.set_node_attr(group, "__type__", GroupType.LIST_OF_ESTIMATORS.name)
    hdf_file.set_node_attr(group, "len", len(estimator_list))
    for i, estimator in enumerate(estimator_list):
        sub_group = hdf_file.create_group(group, f"item_{i}")
        _save_estimator_to_group(hdf_file, sub_group, estimator, fitted)

def __init__(self, h5file: tables.File, parent_group: tables.Group,
             exp_id: str, exp_title: str,
             variables: Mapping[str, VarType]):
    super(_ExperimentWriter, self).__init__()
    self._id = exp_id
    self._title = exp_title
    self._file = h5file

    try:
        self._group = h5file.create_group(parent_group, exp_id, title=exp_title)
    except tables.NodeError:
        try:
            node = h5file.get_node(parent_group, exp_id)
            path = node._v_pathname
            if isinstance(node, tables.Group):
                raise ExperimentElementError('Experiment already exists at '
                                             f'{path} in file {h5file}.')
            elif isinstance(node, tables.Table):
                raise ExperimentElementError('Name conflict: variable '
                                             'table already exists at '
                                             f'{path} in file {h5file}')
            else:
                raise ExperimentElementError(f'Conflict at {path} '
                                             f'in file {h5file}')
        except tables.NoSuchNodeError as e:
            raise ExperimentElementError() from e

    # metadata
    self._group._v_attrs.created = datetime.datetime.now().isoformat()
    self._group._v_attrs.finished = 'unfinished'

    self._var_tables = dict()
    for var_name, var_type in variables.items():
        try:
            tbl = h5file.create_table(
                self._group, var_name,
                description={
                    'record_time': tables.Time64Col(),
                    'experiment_time': tables.Time64Col(),
                    'value': _vartype_columns[var_type]()
                })
            self._var_tables[var_name] = tbl
        except KeyError:
            raise UnsupportedVariableType(var_type)

    self._sub_experiments = dict()

def _write_categorical_metadata(meta: CategoricalFeatureSet, hfile: tables.File) -> None:
    hfile.root.categorical_data.attrs.missing = meta.missing_value
    labels = [k for k in meta.columns.keys()]
    nvalues = np.array([v.nvalues for v in meta.columns.values()])
    D = np.array([v.D for v in meta.columns.values()])
    mappings = [v.mapping for v in meta.columns.values()]
    counts = [v.counts for v in meta.columns.values()]
    _make_str_vlarray(hfile, "categorical_labels", labels)
    hfile.create_array(hfile.root, name="categorical_D", obj=D)
    _make_int_vlarray(hfile, "categorical_counts", counts)
    _make_int_vlarray(hfile, "categorical_mappings", mappings)
    hfile.create_array(hfile.root, name="categorical_nvalues", obj=nvalues)

def _write_continuous_metadata(meta: ContinuousFeatureSet, hfile: tables.File) -> None:
    hfile.root.continuous_data.attrs.missing = meta.missing_value
    hfile.root.continuous_data.attrs.normalised = meta.normalised
    labels = [k for k in meta.columns.keys()]
    D = np.array([v.D for v in meta.columns.values()], dtype=int)
    means = [v.mean for v in meta.columns.values()]
    sds = [v.sd for v in meta.columns.values()]
    _make_str_vlarray(hfile, "continuous_labels", labels)
    hfile.create_array(hfile.root, name="continuous_D", obj=D)
    if meta.normalised:
        _make_float_vlarray(hfile, "continuous_means", means)
        _make_float_vlarray(hfile, "continuous_sds", sds)

def _safe_col_str_change(self, h5: tb.File, colpath: str, coldtype: str,
                         data: (list, tuple), resize: bool):
    colnode = self._get_col(h5, colpath)
    if colnode is None:
        raise Exception(f"Table column doesn't exist: {colpath} in {self._h5file}")
    m = re.match(r'[osc](\d+)', coldtype)
    if not m:
        raise Exception(f'Col dtype for column {colpath} should be [osc] and is not: '
                        f'{coldtype} in {self._h5file}')
    size = int(m.group(1))
    m = re.search(r's(\d+)', str(data.dtype).lower())
    if not m:
        raise Exception(f'Data in column {colpath} should be type similar to {coldtype} '
                        f'and is not in {self._h5file}')
    dlen = int(m.group(1))
    if dlen > size:
        if not resize:
            msg = (f"Data corruption happening in {colpath}. Table may be corrupted. "
                   f"Serialized data len ({dlen}) > ({size + 1}) in {self._h5file}")
            raise Exception(msg)
        else:
            logging.warning(f"Changing column size to {dlen} and overwriting ... {colpath}")
            # safely rewrite data at expense of memory and time
            tmp_col_path = colpath + '_tmp'
            # create new column of desired shape
            newcolnode = self._create_column(h5, tmp_col_path, atom=tb.StringAtom(dlen))
            # re-write all data into that column
            # newcolnode.append(colnode[:])
            for idx, row in enumerate(colnode.iterrows()):
                newcolnode.append([row])
            # clean up old path and point to new path
            h5.remove_node(colpath)
            colname = os.path.basename(colpath)
            table_path = os.path.dirname(colpath)
            h5.rename_node(tmp_col_path, colname)
            # update attributes
            colnode = self._get_col(h5, colpath)
            m = re.search(r'S(\d+)', str(colnode.dtype))
            new_len = int(m.group(1))
            simpah5_attrs = self._read_attrs(h5, table_path)
            col_dtypes = simpah5_attrs[ATTR_COLDTYPE]
            m = re.match(r'([os])', col_dtypes[colname])
            col_dtypes[colname] = f'{m.group(1)}{new_len}'
            _ = self._write_attrs(h5, table_path, ATTR_COLDTYPE, col_dtypes)
    return colnode

def _add_agent(results_file: tables.File, agent_name: str) -> None:
    """
    Implementation for `add_agent`. See that function for a description of this code.

    See Also
    --------
    add_agent : The externally callable wrapper for this function
    """
    avt: tables.Table
    try:
        avt = results_file.get_node("/agents", agent_name)
    except tables.NoSuchNodeError:
        avt = results_file.create_table("/agents", agent_name, AgentVersionRow,
                                        createparents=True)
    n_versions = avt.nrows
    t_uploaded = time.time()
    t_str = time.ctime()

    av_row = avt.row
    av_row["version"] = n_versions
    av_row["uploaded/time_str"] = t_str
    av_row["uploaded/time_sec"] = t_uploaded
    av_row.append()
    avt.flush()

    found = False
    for ac_row in results_file.root.current.where(f'(name == b"{agent_name}")'):
        assert (
            not found
        ), f"There was more than one row with the same agent name: {agent_name}"
        found = True
        ac_row["version"] = n_versions
        ac_row["uploaded/time_str"] = t_str
        ac_row["uploaded/time_sec"] = t_uploaded
        ac_row.update()
    if not found:
        ac_row = results_file.root.current.row
        ac_row["name"] = agent_name
        ac_row["version"] = n_versions
        ac_row["uploaded/time_str"] = t_str
        ac_row["uploaded/time_sec"] = t_uploaded
        ac_row.append()
    results_file.root.current.flush()

def _add_array(file: tables.File, where: tables.Group, name: str, *lists: HomList) -> None:
    """
    Adds a homogeneous array to a tables file, where the array is filled with the
    contents of a set of lists. Each list in *lists is copied into a column of the
    resulting tables.CArray using the same ordering as *lists.

    Parameters
    ----------
    file : tables.File
    where : tables.Group
    name : str
    lists : list of lists, each containing the same scalar data type (e.g. float, int)
    """
    arrays = [np.array(ll) for ll in lists]
    nda = np.empty((max(1, max(a.size for a in arrays)), len(arrays)),
                   dtype=arrays[0].dtype)
    nda.fill(-1)
    for i, a in enumerate(arrays):
        nda[:a.size, i] = a
    ca: tables.CArray = file.create_carray(
        where,
        name,
        tables.Atom.from_dtype(nda.dtype),
        nda.shape,
        filters=compression_filter,
    )
    ca[...] = nda[...]

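# Hedged usage sketch for _add_array (not part of the original module): the file name,
# group name, and data below are hypothetical, and `compression_filter` is assumed to be
# the module-level tables.Filters object that _add_array already references. Each list
# becomes one column of a single CArray; the shorter list is padded with -1.
def _example_add_array_usage() -> None:
    import tables
    with tables.open_file("example_arrays.h5", mode="w") as f:
        grp = f.create_group("/", "results")
        _add_array(f, grp, "scores", [1.0, 2.0, 3.0], [4.0, 5.0])
        print(f.root.results.scores[:])   # shape (3, 2); missing values are -1
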
def _make_float_vlarray(h5file: tables.File, name: str, attribute: np.ndarray) -> None:
    vlarray = h5file.create_vlarray(h5file.root, name=name,
                                    atom=tables.Float64Atom(shape=()))
    for a in attribute:
        vlarray.append(a)

def _make_str_vlarray(h5file: tables.File, name: str, attribute: List[str]) -> None:
    vlarray = h5file.create_vlarray(h5file.root, name=name,
                                    atom=tables.VLStringAtom())
    for a in attribute:
        vlarray.append(a)

def calculate_from_node(self, h5file: File, group: Group):
    table_gamma = h5file.get_node(group, "gamma")
    number = table_gamma.attrs["values_macros_number"]
    height = table_gamma.attrs["values_gdml_height"]
    cell = table_gamma.attrs["values_gdml_cellHeight"]
    field = table_gamma.attrs["values_gdml_fieldValueZ"]
    table_positron = h5file.get_node(group, "positron")
    pos_fb = self.calculate(table_positron.read())
    gamma_fb = self.calculate(table_gamma.read())
    gamma_err = (gamma_fb**0.5) / number
    pos_err = (pos_fb**0.5) / number
    gamma_fb = gamma_fb / number
    pos_fb = pos_fb / number
    return FeedBack(height, field, cell, gamma_fb, pos_fb, gamma_err, pos_err, number)

def _exists_node(self, h5: tb.File, nodepath: str) -> tb.Leaf:
    nodepath = self._norm_path(nodepath)
    try:
        node = h5.get_node(nodepath)
    except tb.NoSuchNodeError:
        node = None
    return node

def _get_node(self, h5: tb.File, nodepath: str) -> tb.Leaf:
    nodepath = self._norm_path(nodepath)
    try:
        node = h5.get_node(nodepath)
    except tb.NoSuchNodeError:
        # get_node raises NoSuchNodeError (not KeyError) for a missing path
        node = None
    return node

def _save_params_to_group(hdf_file: tables.File, group: tables.Group,
                          params_dict: dict, fitted: bool):
    for param_name, param_value in params_dict.items():
        if is_estimator(param_value):
            param_group = hdf_file.create_group(group, param_name)
            _save_estimator_to_group(hdf_file, param_group, param_value, fitted)
        elif is_list_of_named_estimators(param_value):
            param_group = hdf_file.create_group(group, param_name)
            _save_list_of_named_estimators(hdf_file, param_group, param_value, fitted)
        elif is_list_of_estimators(param_value):
            param_group = hdf_file.create_group(group, param_name)
            _save_list_of_estimators(hdf_file, param_group, param_value, fitted)
        else:
            hdf_file.set_node_attr(group, param_name, param_value)

def parse_gctx(infile):
    fid = File(infile, 'r')
    mat = fid.getNode('/0/DATA/0', 'matrix').read()
    rid = fid.getNode('/0/META/ROW', 'id').read()
    cid = fid.getNode('/0/META/COL', 'id').read()
    fid.close()
    return {'matrix': mat, 'rid': rid, 'cid': cid}

def _create_column(self, h5: tb.File, colpath: str, atom: Optional[tb.Atom] = None,
                   expectedrows: int = 10000, shape: Optional[tuple] = None,
                   data: (list, tuple, np.ndarray) = None) -> tb.EArray:
    # create an EArray column and return the created node
    if data is None and shape is None:
        shape = (0,)
    if data is not None and not isinstance(data, np.ndarray) and isinstance(data[0], str):
        data = [x.encode('utf-8') for x in data]
    return h5.create_earray(
        os.path.dirname(colpath),
        os.path.basename(colpath),
        obj=data,
        createparents=True,
        atom=atom,
        shape=shape,
        expectedrows=expectedrows,
        filters=self._filters
    )

def write_coordinates(array_src: CoordinateArraySource, h5file: tables.File,
                      batchsize: int) -> None:
    with array_src:
        shape = array_src.shape[0:1]
        atom = tables.Float64Atom(shape=(array_src.shape[1],))
        filters = tables.Filters(complevel=1, complib="blosc:lz4")
        array = h5file.create_carray(h5file.root, name="coordinates",
                                     atom=atom, shape=shape, filters=filters)
        _make_str_vlarray(h5file, "coordinates_columns", array_src.columns)
        array.attrs.missing = array_src.missing
        for s in batch_slices(batchsize, array_src.shape[0]):
            array[s.start:s.stop] = array_src(s)

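# Hedged sketch of the batched CArray write pattern used by write_coordinates (not part
# of the original module): the file name, array name, and batch size are made up. The
# original uses a vector-valued atom per row; this sketch uses an equivalent plain 2-D
# CArray with a scalar atom to keep the example self-contained.
def _example_batched_carray_write() -> None:
    import numpy as np
    import tables
    data = np.random.rand(10_000, 2)
    with tables.open_file("example_coords.h5", mode="w") as f:
        filters = tables.Filters(complevel=1, complib="blosc:lz4")
        arr = f.create_carray(f.root, name="coordinates", atom=tables.Float64Atom(),
                              shape=data.shape, filters=filters)
        batch = 1_000
        # allocate the compressed array once, then fill it slice by slice
        for start in range(0, data.shape[0], batch):
            arr[start:start + batch] = data[start:start + batch]
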
def __call__(self, path: str, h5file: File, group: Group):
    data = np.fromfile(path, dtype=self.dtype)
    self.tableName = self.filename[:self.filename.rfind('.')]
    if ("e-" in self.tableName):
        self.tableName = self.tableName.replace("e-", "electron")
    if ("e+" in self.tableName):
        self.tableName = self.tableName.replace("e+", "positron")
    # if data.size == 0:
    #     my_table = h5file.create_table(group, self.tableName, desc **self.settings)
    # else:
    my_table = h5file.create_table(group, self.tableName, obj=data, **self.settings)
    my_table.flush()

def __call__(self, path: str, h5file: File, group: Group):
    data = np.loadtxt(path, **self.kwargs)
    if data.size == 1:
        data = data.reshape((1,))
    self.tableName = self.filename[:self.filename.rfind('.')]
    try:
        my_table = h5file.create_table(group, self.tableName, obj=data, **self.settings)
    except IndexError:
        print(group, self.tableName, data, data.size, type(data), data.shape)
        raise
    my_table.flush()

def upload_file():
    """Upload a file to the database"""
    if not current_user.is_authenticated:
        flash("You need to log in or sign up before uploading a file.")
        return redirect(url_for("login"))
    form = UploadFileForm()
    if form.validate_on_submit():
        pic = form.file.data
        filename = generate_filename(File, pic.filename)
        mimetype = pic.mimetype
        file = File(filename=filename, file=pic.read(), mimetype=mimetype,
                    file_owner=current_user)
        db.session.add(file)
        db.session.commit()
        return redirect(url_for("get_files"))
    return render_template("upload-img.html", form=form, logged_in=True, file=True)

def _save_list_of_named_estimators(
    hdf_file: tables.File,
    group: tables.Group,
    estimator_list: List[Tuple[str, BaseEstimator, Any]],
    fitted: bool,
):
    hdf_file.set_node_attr(group, "__type__", GroupType.LIST_OF_NAMED_ESTIMATORS.name)
    hdf_file.set_node_attr(group, "names", [n for (n, e, *r) in estimator_list])
    hdf_file.set_node_attr(group, "rests", [r for (n, e, *r) in estimator_list])
    for (name, estimator, *_rest) in estimator_list:
        sub_group = hdf_file.create_group(group, name)
        _save_estimator_to_group(hdf_file, sub_group, estimator, fitted)

def _write_source(src: ArraySource,
                  hfile: tables.File,
                  atom: tables.Atom,
                  name: str,
                  transform: Worker,
                  n_workers: int,
                  batchrows: Optional[int] = None) -> None:
    front_shape = src.shape[0:-1]
    filters = tables.Filters(complevel=1, complib="blosc:lz4")
    array = hfile.create_carray(hfile.root, name=name, atom=atom,
                                shape=front_shape, filters=filters)
    array.attrs.missing = src.missing
    batchrows = batchrows if batchrows else src.native
    log.info("Writing {} to HDF5 in {}-row batches".format(name, batchrows))
    _write(src, array, batchrows, n_workers, transform)

def _get_total_game_count(results_file: tables.File, agent_name: str) -> int:
    """
    Parameters
    ----------
    results_file : tables.File
    agent_name : str

    Returns
    -------
    game_count : int
        Total games played across all versions of the agent
    """
    game_count = 0
    try:
        vt: tables.Table = results_file.get_node(results_file.root.agents, agent_name)
        for col in ("won", "lost", "drawn", "failed"):
            game_count += np.sum(vt.col(col))
    except tables.NoSuchNodeError:
        return -1
    return game_count

def _add_vlarray(
    file: tables.File,
    where: tables.Group,
    name: str,
    to_store: List[Union[str, HomList]],
) -> None:
    """
    Adds a ragged array to a tables file. Each row in the array is populated by an
    element of `to_store`, which can contain either strings or lists of any of the
    types supported by PyTables. This includes floats, integers and other scalar
    data types.

    Parameters
    ----------
    file : tables.File
    where : tables.Group
    name : str
    to_store : list of either str or lists of scalars
    """
    if to_store:
        if isinstance(to_store[0], str):
            to_store = [s.encode("utf-8") for s in to_store]
            atom = tables.VLStringAtom()
        else:
            to_store = [np.array(ll) for ll in to_store]
            atom = tables.Atom.from_dtype(to_store[0].dtype)
    else:
        atom = tables.StringAtom(itemsize=1)
    vla = file.create_vlarray(
        where,
        name,
        atom=atom,
        filters=compression_filter,
        expectedrows=len(to_store),
    )
    for s in to_store:
        vla.append(s)

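# Hedged usage sketch for _add_vlarray (not part of the original module): the file and
# group names are hypothetical, and `compression_filter` is assumed to be the same
# module-level tables.Filters object used above. Rows of different lengths become
# separate entries of one VLArray; strings are stored with a VLStringAtom.
def _example_add_vlarray_usage() -> None:
    import tables
    with tables.open_file("example_vlarrays.h5", mode="w") as f:
        grp = f.create_group("/", "results")
        _add_vlarray(f, grp, "moves", [[1, 2, 3], [4], [5, 6]])
        _add_vlarray(f, grp, "names", ["agent_a", "agent_b"])
        print(f.root.results.moves[1])   # -> array([4])
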
def _save_validation_to_group(
    hdf_file: tables.File,
    group: tables.Group,
    estimator: BaseEstimator,
    validation_func: str,
    validation_data: Any,
    is_validation_array: bool,
):
    hdf_file.set_node_attr(group, "validation_func", validation_func)
    if is_validation_array:
        # this mode handles large inputs well, but it might cast the array and it
        # does not work with mixed-type arrays
        _save_array_to_group(hdf_file, group, "X", "input", validation_data)
        y = getattr(estimator, validation_func)(group["X"])
        _save_array_to_group(hdf_file, group, "y", "expected_output", y)
    else:
        hdf_file.set_node_attr(group, "X", validation_data)
        y = getattr(estimator, validation_func)(validation_data)
        hdf_file.set_node_attr(group, "y", y)

def _save_estimator_to_group(hdf_file: tables.File, group: tables.Group,
                             estimator: BaseEstimator, fitted: bool):
    # save estimator metadata
    class_name = estimator.__class__.__module__ + "." + estimator.__class__.__name__
    module_version = getattr(__import__(estimator.__class__.__module__), "__version__")
    hdf_file.set_node_attr(group, "__class_name__", class_name)
    hdf_file.set_node_attr(group, "__module_version__", module_version)
    hdf_file.set_node_attr(group, "__type__", GroupType.ESTIMATOR.name)

    # save params
    params_dict = get_params_dict(estimator)
    # one would expect these params to be unfitted, so that `fitted` could be set to
    # False, but some of them (for example pipeline.Pipeline.steps) do include fitted
    # estimators.
    _save_params_to_group(hdf_file, group, params_dict, fitted=fitted)

    if fitted:
        # create fit group
        fit_group = hdf_file.create_group(group, FIT_GROUP)
        hdf_file.set_node_attr(fit_group, "__type__", GroupType.FITTED_ATTRIBUTES.name)
        # save fit params
        fit_params_dict = get_fit_params_dict(estimator)
        _save_params_to_group(hdf_file, fit_group, fit_params_dict, fitted)

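# Hedged sketch of how the estimator serializers above might be driven (not part of the
# original module): the file name is made up, the helpers GroupType, FIT_GROUP,
# get_params_dict and _save_params_to_group are assumed to be defined elsewhere in this
# module, and scikit-learn is assumed to be installed.
def _example_save_estimator() -> None:
    import tables
    from sklearn.linear_model import LogisticRegression
    est = LogisticRegression()
    with tables.open_file("example_estimator.h5", mode="w") as f:
        grp = f.create_group("/", "estimator")
        # unfitted estimator, so only its constructor params are recorded
        _save_estimator_to_group(f, grp, est, fitted=False)
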
def _record_outcome(
    results_file: tables.File,
    game_result: GameResult,
    agent_1_version: int,
    agent_2_version: int,
    agent_1_game_number: int,
    agent_2_game_number: int,
) -> None:
    """
    Record the outcome of a single game for both agents that participated in it.

    More specifically, this function increments the number of wins / losses / draws /
    failures for the two agents, and then records a summary of the game in the
    `all_games` table.

    Parameters
    ----------
    results_file : tables.File
    game_result : GameResult
    agent_1_version : int
    agent_2_version : int
    agent_1_game_number : int
    agent_2_game_number : int
    """
    for result in (game_result.result_1, game_result.result_2):
        agent_name = result.name
        outcome = result.outcome
        vt: tables.Table = results_file.get_node("/agents", agent_name)
        first = True
        for row in vt.iterrows(start=-1):
            assert (
                first
            ), "We only want to update the last row, so this loop should only be entered once."
            first = False
            if outcome == "WIN":
                row["won"] += 1
            elif outcome == "LOSS":
                row["lost"] += 1
            elif outcome == "DRAW":
                row["drawn"] += 1
            else:
                row["failed"] += 1
            row.update()
        vt.flush()

    agent_version = [agent_1_version, agent_2_version]
    game_numbers = [agent_1_game_number, agent_2_game_number]
    result_1, result_2 = game_result.result_1, game_result.result_2

    agt: tables.Table
    try:
        agt = results_file.root.all_games
    except tables.NoSuchNodeError:
        agt = results_file.create_table("/", "all_games", GameSummaryRow)

    gr = agt.row
    gr["winner"] = game_result.winner
    gr["moves"] = len(result_1.moves) + len(result_2.moves)
    gr["when/time_str"] = game_result.time_str
    gr["when/time_sec"] = game_result.time_sec
    for i, result in enumerate((result_1, result_2), 1):
        gr[f"agent{i}/name"] = result.name
        gr[f"agent{i}/version"] = agent_version[i - 1]
        gr[f"agent{i}/game_number"] = game_numbers[i - 1]
        gr[f"agent{i}/rating"] = 0.0
        gr[f"agent{i}/outcome"] = result.outcome
        if result.move_times:
            gr[f"agent{i}/total_time"] = np.sum(result.move_times)
            gr[f"agent{i}/time_med"] = np.median(result.move_times)
            gr[f"agent{i}/time_max"] = np.max(result.move_times)
        else:
            gr[f"agent{i}/total_time"] = -1
            gr[f"agent{i}/time_med"] = -1
            gr[f"agent{i}/time_max"] = -1
        if result.state_size:
            gr[f"agent{i}/state_size_med"] = np.median(result.state_size)
            gr[f"agent{i}/state_size_max"] = np.max(result.state_size)
        else:
            gr[f"agent{i}/state_size_med"] = -1
            gr[f"agent{i}/state_size_max"] = -1
    gr.append()
    agt.flush()