def __init__(self, datastore, group, name, timestamp=None, write_mode='write'):
    """
    Initialise an indexed-string field writer under `group`.

    :param datastore: datastore supplying `timestamp` and `chunksize` defaults
    :param group: parent h5py-style group in which the field is created
    :param name: name of the field
    :param timestamp: creation timestamp; defaults to `datastore.timestamp`
    :param write_mode: passed through to the base writer ('write' by default)
    """
    if timestamp is None:
        timestamp = datastore.timestamp
    fieldtype = 'indexedstring'
    attributes = (('fieldtype', fieldtype),
                  ('timestamp', timestamp),
                  ('chunksize', datastore.chunksize))
    super().__init__(datastore, group, name, write_mode, attributes)
    self.fieldtype = fieldtype
    self.timestamp = timestamp
    self.datastore = datastore
    self.chunk_accumulated = 0
    # Seed the index dataset with the leading zero offset on first creation.
    if 'index' not in self.field.keys():
        DataWriter.write(self.field, 'index', [0], 1)
def write_part(self, values):
    """
    Convert a batch of datetime byte strings to POSIX timestamps and append
    them to the field's 'values' dataset.

    Accepted (stripped) lengths: 32 ('YYYY-MM-DD hh:mm:ss.ffffff+zz:zz'),
    25 ('YYYY-MM-DD hh:mm:ss+zz:zz') and 19 ('YYYY-MM-DD hh:mm:ss').
    Empty values map to timestamp 0.

    NOTE(review): for the 32/25-char forms the timezone suffix is ignored and
    a naive datetime is built, so `.timestamp()` applies the local timezone —
    confirm this is the intended convention.

    :param values: sequence of datetime byte strings
    :raises ValueError: if a non-empty value has an unexpected length
    """
    timestamps = np.zeros(len(values), dtype=np.float64)
    for i, raw in enumerate(values):
        value = raw.strip()
        if value == b'':
            timestamps[i] = 0
            continue
        length = len(value)
        if length == 32:
            # includes fractional seconds in characters 20:26
            parsed = datetime(int(value[0:4]), int(value[5:7]), int(value[8:10]),
                              int(value[11:13]), int(value[14:16]), int(value[17:19]),
                              int(value[20:26]))
        elif length in (25, 19):
            # both forms parse identically once the tz suffix is dropped
            parsed = datetime(int(value[0:4]), int(value[5:7]), int(value[8:10]),
                              int(value[11:13]), int(value[14:16]), int(value[17:19]))
        else:
            raise ValueError(
                f"Date field '{self.field}' has unexpected format '{value}'"
            )
        timestamps[i] = parsed.timestamp()
    DataWriter.write(self.field, 'values', timestamps, len(timestamps))
def write_part_raw(self, index, values):
    """
    Write pre-built index/value arrays directly to the field.

    :param index: int64 ndarray of cumulative byte offsets
    :param values: uint8 or 'S1' ndarray of raw character data
    :raises ValueError: if either array has an unexpected dtype
    """
    if index.dtype != np.int64:
        raise ValueError(f"'index' must be an ndarray of '{np.int64}'")
    if values.dtype not in (np.uint8, 'S1'):
        raise ValueError(f"'values' must be an ndarray of '{np.uint8}' or 'S1'")
    for dataset_name, data in (('index', index), ('values', values)):
        DataWriter.write(self.field, dataset_name, data, len(data))
def write(self, part):
    """Write `part` in one shot (using the dataset's dtype), then complete."""
    part_length = len(part)
    DataWriter.write(self._field, self._name, part, part_length,
                     dtype=self._dataset.dtype)
    self.complete()
def write_part(self, values):
    """
    Writes a list of strings in indexed string form to a field.

    Characters are buffered into `self.values` and cumulative byte offsets
    into `self.indices`; either buffer is flushed whenever it reaches the
    datastore chunksize (any remainder is flushed by `flush`).

    :param values: a list of utf8 strings (str or bytes; bytes are written
        as-is, matching the behaviour of the other indexed-string writers
        in this module)
    """
    if not self.ever_written:
        # the index dataset always starts with the current accumulated offset
        self.indices[0] = self.accumulated
        self.index_index = 1
        self.ever_written = True

    for s in values:
        # accept bytes as well as str, consistent with the sibling writer
        evalue = s.encode() if isinstance(s, str) else s
        for v in evalue:
            self.values[self.value_index] = v
            self.value_index += 1
            if self.value_index == self.datastore.chunksize:
                DataWriter.write(self.field, 'values', self.values, self.value_index)
                self.value_index = 0
            # accumulated counts total bytes written; indices store offsets
            self.accumulated += 1
        self.indices[self.index_index] = self.accumulated
        self.index_index += 1
        if self.index_index == self.datastore.chunksize:
            DataWriter.write(self.field, 'index', self.indices, self.index_index)
            self.index_index = 0
def timestamp_field_constructor(session, group, name, timestamp=None, chunksize=None):
    """Create an empty field tagged 'timestamp' backed by a float64 values dataset."""
    # NOTE: 'contructor' (sic) is the helper's actual name elsewhere in the file
    field = base_field_contructor(session, group, name, timestamp, chunksize)
    field.attrs['fieldtype'] = 'timestamp'
    DataWriter.write(field, 'values', [], 0, 'float64')
def indexed_string_field_constructor(session, group, name, timestamp=None, chunksize=None):
    """Create an empty indexed-string field: int64 offsets plus uint8 character data."""
    field = base_field_contructor(session, group, name, timestamp, chunksize)
    field.attrs['fieldtype'] = 'indexedstring'
    # seed both datasets empty; writers append offsets and raw bytes later
    DataWriter.write(field, 'index', [], 0, 'int64')
    DataWriter.write(field, 'values', [], 0, 'uint8')
def numeric_field_constructor(session, group, name, nformat, timestamp=None, chunksize=None):
    """Create an empty numeric field whose values dataset uses dtype `nformat`."""
    field = base_field_contructor(session, group, name, timestamp, chunksize)
    field.attrs['fieldtype'] = f'numeric,{nformat}'
    field.attrs['nformat'] = nformat
    DataWriter.write(field, 'values', [], 0, nformat)
def fixed_string_field_constructor(session, group, name, length, timestamp=None, chunksize=None):
    """Create an empty fixed-string field of `length` bytes per entry."""
    field = base_field_contructor(session, group, name, timestamp, chunksize)
    field.attrs['fieldtype'] = f'fixedstring,{length}'
    field.attrs['strlen'] = length
    DataWriter.write(field, 'values', [], 0, f'S{length}')
def write_part(self, values):
    """
    Convert 'YYYY-MM-DD' byte strings to POSIX timestamps and append them
    to the field's 'values' dataset. Empty values map to timestamp 0.

    :param values: sequence of date byte strings
    """
    timestamps = np.zeros(len(values), dtype=np.float64)
    for i, value in enumerate(values):
        if value == b'':
            timestamps[i] = 0
        else:
            parsed = datetime.strptime(value.decode(), '%Y-%m-%d')
            timestamps[i] = parsed.timestamp()
    DataWriter.write(self.field, 'values', timestamps, len(timestamps))
def flush(self):
    """Write any buffered characters/offsets, then delegate to the base flush."""
    # write values even when empty so the dataset exists for empty fields
    must_write_values = self.value_index != 0 or 'values' not in self.field
    if must_write_values:
        DataWriter.write(self.field, 'values', self.values, self.value_index)
        self.value_index = 0
    if self.index_index != 0:
        DataWriter.write(self.field, 'index', self.indices, self.index_index)
        self.index_index = 0
    super().flush()
def flush(self):
    """Persist the categorical key mapping (codes and names), then base flush."""
    names = []
    codes = np.zeros(len(self.keys), dtype='int8')
    for i, (key_name, key_code) in enumerate(self.keys.items()):
        names.append(key_name)
        codes[i] = key_code
    DataWriter.write(self.field, 'key_values', codes, len(codes))
    DataWriter.write(self.field, 'key_names', names, len(names),
                     dtype=h5py.string_dtype())
    super().flush()
def clear(self):
    """Reset the field to empty index/value datasets and zero the offset count."""
    self._accumulated = 0
    # drop then recreate both datasets empty, with their canonical dtypes
    DataWriter.clear_dataset(self._field, self._index_name)
    DataWriter.clear_dataset(self._field, self._values_name)
    DataWriter.write(self._field, self._index_name, [], 0, 'int64')
    DataWriter.write(self._field, self._values_name, [], 0, 'uint8')
    # rebind cached dataset handles to the freshly created datasets
    self._index_dataset = self._field[self._index_name]
    self._values_dataset = self._field[self._values_name]
def __init__(self, datastore, group, name, write_mode, attributes):
    """
    Create the named field group, moving any existing field to trash first.

    :param datastore: datastore providing `get_trash_group`
    :param group: parent group in which the field is created
    :param name: field name
    :param write_mode: must be one of `write_modes`; 'overwrite' permits
        replacing an existing field (the old one is moved to trash)
    :param attributes: attribute pairs passed to `DataWriter.create_group`
    :raises ValueError: if `write_mode` is not a recognised mode
    :raises KeyError: if the field exists and `write_mode` is not 'overwrite'
    """
    self.trash_field = None
    if write_mode not in write_modes:
        raise ValueError(f"'write_mode' must be one of {write_modes}")
    if name in group:
        if write_mode != 'overwrite':
            error = (f"Field '{name}' already exists. Set 'write_mode' to 'overwrite' "
                     "if you want to overwrite the existing contents")
            raise KeyError(error)
        # move the existing field into the trash group so it can be reclaimed
        field = group[name]
        trash = datastore.get_trash_group(field)
        dest_name = trash.name + f"/{name.split('/')[-1]}"
        group.move(field.name, dest_name)
        self.trash_field = trash[name]
    DataWriter.create_group(group, name, attributes)
    self.field = group[name]
    self.name = name
def write_part(self, values):
    """
    Writes a list of strings in indexed string form to a field.

    Resumes from the last offset already stored in the field's 'index'
    dataset, buffers character bytes up to `self.chunksize` at a time, and
    appends one cumulative offset per input string.

    :param values: a list of utf8 strings (str or bytes)
    """
    total_bytes = self.field['index'][-1]
    offsets = np.zeros(len(values), dtype=np.int64)
    char_buffer = np.zeros(self.chunksize, dtype=np.uint8)
    offset_count = 0
    char_count = 0
    for entry in values:
        encoded = entry.encode() if isinstance(entry, str) else entry
        for byte in encoded:
            char_buffer[char_count] = byte
            char_count += 1
            if char_count == len(char_buffer):
                # buffer full: flush characters and start refilling
                DataWriter.write(self.field, 'values', char_buffer, char_count)
                char_count = 0
            total_bytes += 1
        offsets[offset_count] = total_bytes
        offset_count += 1
    if offset_count > 0:
        DataWriter.write(self.field, 'index', offsets, offset_count)
    if char_count > 0:
        DataWriter.write(self.field, 'values', char_buffer, char_count)
def complete(self):
    """Flush any remaining buffered characters and offsets to the field."""
    if self._value_index != 0:
        DataWriter.write(self._field, self._values_name,
                         self._raw_values, self._value_index)
        self._value_index = 0
    if self._index_index != 0:
        # first-ever index write must be preceded by the leading zero offset
        if len(self._field['index']) == 0:
            DataWriter.write(self._field, self._index_name, [0], 1)
        DataWriter.write(self._field, self._index_name,
                         self._raw_indices, self._index_index)
        self._index_index = 0
def categorical_field_constructor(session, group, name, nformat, key,
                                  timestamp=None, chunksize=None):
    """Create an empty categorical field and persist its name-to-code key mapping."""
    field = base_field_contructor(session, group, name, timestamp, chunksize)
    field.attrs['fieldtype'] = f'categorical,{nformat}'
    field.attrs['nformat'] = nformat
    DataWriter.write(field, 'values', [], 0, nformat)
    key_names = list(key)
    key_values = [key[k] for k in key_names]
    DataWriter.write(field, 'key_values', key_values, len(key_values), 'int8')
    DataWriter.write(field, 'key_names', key_names, len(key_names),
                     h5py.special_dtype(vlen=str))
def write_part(self, part):
    """
    Append the encoded bytes of each string in `part`, flushing either buffer
    whenever it reaches `self._chunksize`; one cumulative offset is recorded
    per string.

    :param part: a list of utf8 strings
    """
    for entry in part:
        encoded = entry.encode()
        for byte in encoded:
            self._raw_values[self._value_index] = byte
            self._value_index += 1
            if self._value_index == self._chunksize:
                DataWriter.write(self._field, self._values_name,
                                 self._raw_values, self._value_index)
                self._value_index = 0
            # _accumulated counts total bytes; offsets are cumulative
            self._accumulated += 1
        self._raw_indices[self._index_index] = self._accumulated
        self._index_index += 1
        if self._index_index == self._chunksize:
            # first-ever index write must be preceded by the leading zero offset
            if len(self._field['index']) == 0:
                DataWriter.write(self._field, self._index_name, [0], 1)
            DataWriter.write(self._field, self._index_name,
                             self._raw_indices, self._index_index)
            self._index_index = 0
def write_part(self, values):
    """Append a chunk of values to the field's 'values' dataset."""
    count = len(values)
    DataWriter.write(self.field, 'values', values, count)
def clear(self):
    """
    Clear the field's named dataset via DataWriter.

    TODO: unlink the dataset
    """
    # NOTE(review): sibling writers call DataWriter.clear_dataset (no leading
    # underscore) — confirm both helpers exist and this is the intended one.
    DataWriter._clear_dataset(self._field, self._name)
def write_part(self, values):
    """
    Append numeric values, casting to the field's `nformat` dtype first when
    the incoming dtype does not already match.

    :param values: ndarray of numeric values
    """
    target_dtype = np.dtype(self.nformat)
    if not np.issubdtype(values.dtype, target_dtype):
        values = values.astype(self.nformat)
    DataWriter.write(self.field, 'values', values, len(values))
def complete(self):
    # Delegate flushing of this writer's named dataset to DataWriter.
    DataWriter.flush(self._field[self._name])
def flush(self):
    # Flush the field, then drop the reference to any field displaced to
    # trash during an 'overwrite' (set in __init__).
    # NOTE(review): `del self.trash_field` removes the attribute itself, so a
    # second flush on the same writer would raise AttributeError — confirm
    # flush is only called once per writer.
    DataWriter.flush(self.field)
    if self.trash_field is not None:
        del self.trash_field